From 6129c9a56d52ebb060417cb397e0764cdd8791bc Mon Sep 17 00:00:00 2001 From: liferoad Date: Mon, 7 Oct 2024 11:05:34 -0400 Subject: [PATCH 01/14] allow numpy 2.1.x --- sdks/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 721cb4c1a8dd7..6ce7cfdfd556e 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -366,7 +366,7 @@ def get_portability_package_data(): 'jsonpickle>=3.0.0,<4.0.0', # numpy can have breaking changes in minor versions. # Use a strict upper bound. - 'numpy>=1.14.3,<1.27.0', # Update pyproject.toml as well. + 'numpy>=1.14.3,<2.2.0', # Update pyproject.toml as well. 'objsize>=0.6.1,<0.8.0', 'packaging>=22.0', 'pymongo>=3.8.0,<5.0.0', From b8accd2583c6b89acf03fcdeccf13895ae77ef0c Mon Sep 17 00:00:00 2001 From: liferoad Date: Mon, 7 Oct 2024 16:49:37 -0400 Subject: [PATCH 02/14] fixed the mypy --- sdks/python/apache_beam/ml/inference/tensorrt_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/ml/inference/tensorrt_inference.py b/sdks/python/apache_beam/ml/inference/tensorrt_inference.py index b38947b494c20..9563aa05232a5 100644 --- a/sdks/python/apache_beam/ml/inference/tensorrt_inference.py +++ b/sdks/python/apache_beam/ml/inference/tensorrt_inference.py @@ -125,7 +125,7 @@ def __init__(self, engine: trt.ICudaEngine): # TODO(https://github.com/NVIDIA/TensorRT/issues/2557): # Clean up when fixed upstream. try: - _ = np.bool # type: ignore + _ = np.bool except AttributeError: # numpy >= 1.24.0 np.bool = np.bool_ # type: ignore @@ -258,7 +258,7 @@ def __init__( model_copies: The exact number of models that you would like loaded onto your machine. This can be useful if you exactly know your CPU or GPU capacity and want to maximize resource utilization. - max_batch_duration_secs: the maximum amount of time to buffer + max_batch_duration_secs: the maximum amount of time to buffer a batch before emitting; used in streaming contexts. kwargs: Additional arguments like 'engine_path' and 'onnx_path' are currently supported. 'env_vars' can be used to set environment variables From 8a6f248c57eff70ac31956fe512a91f7453efeb6 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 8 Oct 2024 19:31:11 -0400 Subject: [PATCH 03/14] Enforce a size limit on StringSetData (#32650) * Enforce a size limit on StringSetData * Make StringSetData set mutable. 
This avoids copy and create new ImutableSet every time * adjust warning log --- .../runners/core/metrics/StringSetCell.java | 9 +- .../runners/core/metrics/StringSetData.java | 102 ++++++++++++--- .../core/metrics/StringSetDataTest.java | 34 ++++- sdks/python/apache_beam/metrics/cells.pxd | 2 +- sdks/python/apache_beam/metrics/cells.py | 116 ++++++++++++++++-- sdks/python/apache_beam/metrics/cells_test.py | 16 ++- sdks/python/apache_beam/metrics/execution.py | 3 +- .../apache_beam/metrics/execution_test.py | 11 +- .../apache_beam/metrics/monitoring_infos.py | 5 +- 9 files changed, 255 insertions(+), 43 deletions(-) diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetCell.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetCell.java index 8455f154c0f8f..fc8dcb49894fe 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetCell.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetCell.java @@ -22,7 +22,6 @@ import org.apache.beam.sdk.metrics.MetricName; import org.apache.beam.sdk.metrics.MetricsContainer; import org.apache.beam.sdk.metrics.StringSet; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -101,11 +100,15 @@ public void add(String value) { if (this.setValue.get().stringSet().contains(value)) { return; } - update(StringSetData.create(ImmutableSet.of(value))); + add(new String[] {value}); } @Override public void add(String... values) { - update(StringSetData.create(ImmutableSet.copyOf(values))); + StringSetData original; + do { + original = setValue.get(); + } while (!setValue.compareAndSet(original, original.addAll(values))); + dirty.afterModification(); } } diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java index 466d4ad46eb6f..4fc5d3beca31e 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java @@ -19,25 +19,49 @@ import com.google.auto.value.AutoValue; import java.io.Serializable; +import java.util.Arrays; +import java.util.HashSet; import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.StreamSupport; import org.apache.beam.sdk.metrics.StringSetResult; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Data describing the StringSet. The {@link StringSetData} hold an immutable copy of the set from - * which it was initially created. This should retain enough detail that it can be combined with - * other {@link StringSetData}. + * Data describing the StringSet. The {@link StringSetData} hold a copy of the set from which it was + * initially created. This should retain enough detail that it can be combined with other {@link + * StringSetData}. + * + *
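+ * <p>For illustration, a typical interaction with this class might look like the following
+ * (the string values are arbitrary):
+ *
+ * <pre>{@code
+ * StringSetData data = StringSetData.empty();
+ * data = data.addAll("pc_1", "pc_2"); // reuses the mutable backing set where possible
+ * StringSetData merged = data.combine(StringSetData.create(ImmutableSet.of("pc_3")));
+ * }</pre>
+ *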
+ * <p>The underlying set is mutable for the {@link #addAll} operation; otherwise a copy of the
+ * set is generated.
+ *
+ * <p>
The summation of all string length for a {@code StringSetData} cannot exceed 1 MB. Further + * addition of elements are dropped. */ @AutoValue public abstract class StringSetData implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(StringSetData.class); + // 1 MB + @VisibleForTesting static final long STRING_SET_SIZE_LIMIT = 1_000_000L; public abstract Set stringSet(); + public abstract long stringSize(); + /** Returns a {@link StringSetData} which is made from an immutable copy of the given set. */ public static StringSetData create(Set set) { - return new AutoValue_StringSetData(ImmutableSet.copyOf(set)); + if (set.isEmpty()) { + return empty(); + } + HashSet combined = new HashSet<>(); + long stringSize = addUntilCapacity(combined, 0L, set); + return new AutoValue_StringSetData(combined, stringSize); + } + + /** Returns a {@link StringSetData} which is made from the given set in place. */ + private static StringSetData createInPlace(HashSet set, long stringSize) { + return new AutoValue_StringSetData(set, stringSize); } /** Return a {@link EmptyStringSetData#INSTANCE} representing an empty {@link StringSetData}. */ @@ -45,6 +69,23 @@ public static StringSetData empty() { return EmptyStringSetData.INSTANCE; } + /** + * Add strings into this {@code StringSetData} and return the result {@code StringSetData}. Reuse + * the original StringSetData's set. As a result, current StringSetData will become invalid. + * + *
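+   * <p>For example (illustrative), after {@code StringSetData b = a.addAll("x");} only
+   * {@code b} should be used: {@code a} may now share, and further mutate, the same backing set.
+   *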
>Should only be used by {@link StringSetCell#add}. + */ + public StringSetData addAll(String... strings) { + HashSet combined; + if (this.stringSet() instanceof HashSet) { + combined = (HashSet) this.stringSet(); + } else { + combined = new HashSet<>(this.stringSet()); + } + long stringSize = addUntilCapacity(combined, this.stringSize(), Arrays.asList(strings)); + return StringSetData.createInPlace(combined, stringSize); + } + /** * Combines this {@link StringSetData} with other, both original StringSetData are left intact. */ @@ -54,10 +95,9 @@ public StringSetData combine(StringSetData other) { } else if (other.stringSet().isEmpty()) { return this; } else { - ImmutableSet.Builder combined = ImmutableSet.builder(); - combined.addAll(this.stringSet()); - combined.addAll(other.stringSet()); - return StringSetData.create(combined.build()); + HashSet combined = new HashSet<>(this.stringSet()); + long stringSize = addUntilCapacity(combined, this.stringSize(), other.stringSet()); + return StringSetData.createInPlace(combined, stringSize); } } @@ -65,12 +105,12 @@ public StringSetData combine(StringSetData other) { * Combines this {@link StringSetData} with others, all original StringSetData are left intact. */ public StringSetData combine(Iterable others) { - Set combined = - StreamSupport.stream(others.spliterator(), true) - .flatMap(other -> other.stringSet().stream()) - .collect(Collectors.toSet()); - combined.addAll(this.stringSet()); - return StringSetData.create(combined); + HashSet combined = new HashSet<>(this.stringSet()); + long stringSize = this.stringSize(); + for (StringSetData other : others) { + stringSize = addUntilCapacity(combined, stringSize, other.stringSet()); + } + return StringSetData.createInPlace(combined, stringSize); } /** Returns a {@link StringSetResult} representing this {@link StringSetData}. */ @@ -78,6 +118,31 @@ public StringSetResult extractResult() { return StringSetResult.create(stringSet()); } + /** Add strings into set until reach capacity. Return the all string size of added set. */ + private static long addUntilCapacity( + HashSet combined, long currentSize, Iterable others) { + if (currentSize > STRING_SET_SIZE_LIMIT) { + // already at capacity + return currentSize; + } + for (String string : others) { + if (combined.add(string)) { + currentSize += string.length(); + + // check capacity both before insert and after insert one, so the warning only emit once. + if (currentSize > STRING_SET_SIZE_LIMIT) { + LOG.warn( + "StringSet metrics reaches capacity. Further incoming elements won't be recorded." + + " Current size: {}, last element size: {}.", + currentSize, + string.length()); + break; + } + } + } + return currentSize; + } + /** Empty {@link StringSetData}, representing no values reported and is immutable. */ public static class EmptyStringSetData extends StringSetData { @@ -91,6 +156,11 @@ public Set stringSet() { return ImmutableSet.of(); } + @Override + public long stringSize() { + return 0L; + } + /** Return a {@link StringSetResult#empty()} which is immutable empty set. 
*/ @Override public StringSetResult extractResult() { diff --git a/runners/core-java/src/test/java/org/apache/beam/runners/core/metrics/StringSetDataTest.java b/runners/core-java/src/test/java/org/apache/beam/runners/core/metrics/StringSetDataTest.java index 665ce3743c511..534db203ff3c3 100644 --- a/runners/core-java/src/test/java/org/apache/beam/runners/core/metrics/StringSetDataTest.java +++ b/runners/core-java/src/test/java/org/apache/beam/runners/core/metrics/StringSetDataTest.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue; import java.util.Collections; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.junit.Rule; import org.junit.Test; @@ -81,6 +82,14 @@ public void testStringSetDataEmptyIsImmutable() { assertThrows(UnsupportedOperationException.class, () -> empty.stringSet().add("aa")); } + @Test + public void testStringSetDataEmptyCanAdd() { + ImmutableSet contents = ImmutableSet.of("ab", "cd"); + StringSetData stringSetData = StringSetData.empty(); + stringSetData = stringSetData.addAll(contents.toArray(new String[] {})); + assertEquals(stringSetData.stringSet(), contents); + } + @Test public void testEmptyExtract() { assertTrue(StringSetData.empty().extractResult().getStringSet().isEmpty()); @@ -94,9 +103,26 @@ public void testExtract() { } @Test - public void testExtractReturnsImmutable() { - StringSetData stringSetData = StringSetData.create(ImmutableSet.of("ab", "cd")); - // check that immutable copy is returned - assertThrows(UnsupportedOperationException.class, () -> stringSetData.stringSet().add("aa")); + public void testStringSetAddUntilCapacity() { + StringSetData combined = StringSetData.empty(); + @SuppressWarnings("InlineMeInliner") // Inline representation is Java11+ only + String commonPrefix = Strings.repeat("*", 1000); + long stringSize = 0; + for (int i = 0; i < 1000; ++i) { + String s = commonPrefix + i; + stringSize += s.length(); + combined = combined.addAll(s); + } + assertTrue(combined.stringSize() < stringSize); + assertTrue(combined.stringSize() > StringSetData.STRING_SET_SIZE_LIMIT); + } + + @Test + public void testStringSetAddSizeTrackedCorrectly() { + StringSetData combined = StringSetData.empty(); + combined = combined.addAll("a", "b", "c", "b"); + assertEquals(3, combined.stringSize()); + combined = combined.addAll("c", "d", "e"); + assertEquals(5, combined.stringSize()); } } diff --git a/sdks/python/apache_beam/metrics/cells.pxd b/sdks/python/apache_beam/metrics/cells.pxd index a8f4003d89808..98bb5eff09775 100644 --- a/sdks/python/apache_beam/metrics/cells.pxd +++ b/sdks/python/apache_beam/metrics/cells.pxd @@ -45,7 +45,7 @@ cdef class GaugeCell(MetricCell): cdef class StringSetCell(MetricCell): - cdef readonly set data + cdef readonly object data cdef inline bint _update(self, value) except -1 diff --git a/sdks/python/apache_beam/metrics/cells.py b/sdks/python/apache_beam/metrics/cells.py index 407106342fb81..63fc9f3f7cc9e 100644 --- a/sdks/python/apache_beam/metrics/cells.py +++ b/sdks/python/apache_beam/metrics/cells.py @@ -23,11 +23,14 @@ # pytype: skip-file +import logging import threading import time from datetime import datetime from typing import Any +from typing import Iterable from typing import Optional +from typing import Set from typing import SupportsInt try: @@ -47,6 +50,8 @@ class fake_cython: 'GaugeResult' ] +_LOGGER = logging.getLogger(__name__) + class MetricCell(object): """For internal 
use only; no backwards-compatibility guarantees. @@ -297,9 +302,9 @@ def _update(self, value): self.data.add(value) def get_cumulative(self): - # type: () -> set + # type: () -> StringSetData with self._lock: - return set(self.data) + return self.data.get_cumulative() def combine(self, other): # type: (StringSetCell) -> StringSetCell @@ -522,6 +527,98 @@ def singleton(value): return DistributionData(value, 1, value, value) +class StringSetData(object): + """For internal use only; no backwards-compatibility guarantees. + + The data structure that holds data about a StringSet metric. + + StringSet metrics are restricted to set of strings only. + + This object is not thread safe, so it's not supposed to be modified + by other than the StringSetCell that contains it. + + The summation of all string length for a StringSetData cannot exceed 1 MB. + Further addition of elements are dropped. + """ + + _STRING_SET_SIZE_LIMIT = 1_000_000 + + def __init__(self, string_set: Optional[Set] = None, string_size: int = 0): + self.string_set = string_set or set() + if not string_size: + string_size = 0 + for s in self.string_set: + string_size += len(s) + self.string_size = string_size + + def __eq__(self, other: object) -> bool: + if isinstance(other, StringSetData): + return ( + self.string_size == other.string_size and + self.string_set == other.string_set) + else: + return False + + def __hash__(self) -> int: + return hash(self.string_set) + + def __repr__(self) -> str: + return 'StringSetData{}:{}'.format(self.string_set, self.string_size) + + def get_cumulative(self) -> "StringSetData": + return StringSetData(set(self.string_set), self.string_size) + + def add(self, *strings): + """ + Add strings into this StringSetData and return the result StringSetData. + Reuse the original StringSetData's set. + """ + self.string_size = self.add_until_capacity( + self.string_set, self.string_size, strings) + return self + + def combine(self, other: "StringSetData") -> "StringSetData": + """ + Combines this StringSetData with other, both original StringSetData are left + intact. + """ + if other is None: + return self + + combined = set(self.string_set) + string_size = self.add_until_capacity( + combined, self.string_size, other.string_set) + return StringSetData(combined, string_size) + + @classmethod + def add_until_capacity( + cls, combined: set, current_size: int, others: Iterable[str]): + """ + Add strings into set until reach capacity. Return the all string size of + added set. + """ + if current_size > cls._STRING_SET_SIZE_LIMIT: + return current_size + + for string in others: + if string not in combined: + combined.add(string) + current_size += len(string) + if current_size > cls._STRING_SET_SIZE_LIMIT: + _LOGGER.warning( + "StringSet metrics reaches capacity. Further incoming elements " + "won't be recorded. Current size: %d, last element size: %d.", + current_size, + len(string)) + break + return current_size + + @staticmethod + def singleton(value): + # type: (int) -> DistributionData + return DistributionData(value, 1, value, value) + + class MetricAggregator(object): """For internal use only; no backwards-compatibility guarantees. 
@@ -612,17 +709,18 @@ def result(self, x): class StringSetAggregator(MetricAggregator): @staticmethod def identity_element(): - # type: () -> set - return set() + # type: () -> StringSetData + return StringSetData() def combine(self, x, y): - # type: (set, set) -> set - if len(x) == 0: + # type: (StringSetData, StringSetData) -> StringSetData + if len(x.string_set) == 0: return y - elif len(y) == 0: + elif len(y.string_set) == 0: return x else: - return set.union(x, y) + return x.combine(y) def result(self, x): - return x + # type: (StringSetData) -> set + return set(x.string_set) diff --git a/sdks/python/apache_beam/metrics/cells_test.py b/sdks/python/apache_beam/metrics/cells_test.py index 052ff051bf964..d1ee37b8ed820 100644 --- a/sdks/python/apache_beam/metrics/cells_test.py +++ b/sdks/python/apache_beam/metrics/cells_test.py @@ -26,6 +26,7 @@ from apache_beam.metrics.cells import GaugeCell from apache_beam.metrics.cells import GaugeData from apache_beam.metrics.cells import StringSetCell +from apache_beam.metrics.cells import StringSetData from apache_beam.metrics.metricbase import MetricName @@ -176,9 +177,9 @@ def test_not_leak_mutable_set(self): c.add('test') c.add('another') s = c.get_cumulative() - self.assertEqual(s, set(('test', 'another'))) + self.assertEqual(s, StringSetData({'test', 'another'}, 11)) s.add('yet another') - self.assertEqual(c.get_cumulative(), set(('test', 'another'))) + self.assertEqual(c.get_cumulative(), StringSetData({'test', 'another'}, 11)) def test_combine_appropriately(self): s1 = StringSetCell() @@ -190,7 +191,16 @@ def test_combine_appropriately(self): s2.add('3') result = s2.combine(s1) - self.assertEqual(result.data, set(('1', '2', '3'))) + self.assertEqual(result.data, StringSetData({'1', '2', '3'})) + + def test_add_size_tracked_correctly(self): + s = StringSetCell() + s.add('1') + s.add('2') + self.assertEqual(s.data.string_size, 2) + s.add('2') + s.add('3') + self.assertEqual(s.data.string_size, 3) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/metrics/execution.py b/sdks/python/apache_beam/metrics/execution.py index 37007add91638..fa70d3a4d9c01 100644 --- a/sdks/python/apache_beam/metrics/execution.py +++ b/sdks/python/apache_beam/metrics/execution.py @@ -47,6 +47,7 @@ from apache_beam.metrics.cells import DistributionCell from apache_beam.metrics.cells import GaugeCell from apache_beam.metrics.cells import StringSetCell +from apache_beam.metrics.cells import StringSetData from apache_beam.runners.worker import statesampler from apache_beam.runners.worker.statesampler import get_current_tracker @@ -356,7 +357,7 @@ def __init__( counters=None, # type: Optional[Dict[MetricKey, int]] distributions=None, # type: Optional[Dict[MetricKey, DistributionData]] gauges=None, # type: Optional[Dict[MetricKey, GaugeData]] - string_sets=None, # type: Optional[Dict[MetricKey, set]] + string_sets=None, # type: Optional[Dict[MetricKey, StringSetData]] ): # type: (...) 
-> None diff --git a/sdks/python/apache_beam/metrics/execution_test.py b/sdks/python/apache_beam/metrics/execution_test.py index b157aeb20e9ed..38e27f1f3d0c4 100644 --- a/sdks/python/apache_beam/metrics/execution_test.py +++ b/sdks/python/apache_beam/metrics/execution_test.py @@ -110,11 +110,12 @@ def test_get_cumulative_or_updates(self): self.assertEqual( set(all_values), {v.value for _, v in cumulative.gauges.items()}) - self.assertEqual({str(i % 7) - for i in all_values}, - functools.reduce( - set.union, - (v for _, v in cumulative.string_sets.items()))) + self.assertEqual( + {str(i % 7) + for i in all_values}, + functools.reduce( + set.union, + (v.string_set for _, v in cumulative.string_sets.items()))) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/metrics/monitoring_infos.py b/sdks/python/apache_beam/metrics/monitoring_infos.py index a9540f2846adc..09cb350b38268 100644 --- a/sdks/python/apache_beam/metrics/monitoring_infos.py +++ b/sdks/python/apache_beam/metrics/monitoring_infos.py @@ -31,6 +31,7 @@ from apache_beam.metrics.cells import DistributionResult from apache_beam.metrics.cells import GaugeData from apache_beam.metrics.cells import GaugeResult +from apache_beam.metrics.cells import StringSetData from apache_beam.portability import common_urns from apache_beam.portability.api import metrics_pb2 @@ -305,10 +306,12 @@ def user_set_string(namespace, name, metric, ptransform=None): Args: namespace: User-defined namespace of StringSet. name: Name of StringSet. - metric: The set representing the metrics. + metric: The StringSetData representing the metrics. ptransform: The ptransform id used as a label. """ labels = create_labels(ptransform=ptransform, namespace=namespace, name=name) + if isinstance(metric, StringSetData): + metric = metric.string_set if isinstance(metric, set): metric = list(metric) if isinstance(metric, list): From c4b41708ffa68652cbead0a67aae0c0f0f358a3b Mon Sep 17 00:00:00 2001 From: Jeff Kinard Date: Tue, 8 Oct 2024 19:31:35 -0400 Subject: [PATCH 04/14] Force kafka 3.1.2 for expansion-service jar (#32703) Signed-off-by: Jeffrey Kinard --- sdks/java/io/expansion-service/build.gradle | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sdks/java/io/expansion-service/build.gradle b/sdks/java/io/expansion-service/build.gradle index d7fef3d823324..26a001b6ea240 100644 --- a/sdks/java/io/expansion-service/build.gradle +++ b/sdks/java/io/expansion-service/build.gradle @@ -27,6 +27,12 @@ applyJavaNature( shadowClosure: {}, ) +// TODO(https://github.com/apache/beam/pull/32486/) Use library.java.kafka_clients once >=3.1.0 is set as default +configurations.runtimeClasspath { + // Pin kafka-clients version due to <3.1.0 missing auth callback classes + resolutionStrategy.force 'org.apache.kafka:kafka-clients:3.1.2' +} + shadowJar { mergeServiceFiles() } @@ -52,8 +58,7 @@ dependencies { runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") - // TODO(https://github.com/apache/beam/pull/32486/) Use library.java.kafka_clients once 3.1.2 is set as default - runtimeOnly ("org.apache.kafka:kafka-clients:3.1.2") + runtimeOnly library.java.kafka_clients runtimeOnly library.java.slf4j_jdk14 } From 14793629dfef2547d4d8d454abee0894da18bbb6 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 8 Oct 2024 19:51:38 -0400 Subject: [PATCH 05/14] Report File Lineage on directory (#32662) * Report File Lineage on directory * added comments, restore lineage assert 
in TextIOIT * Report bucket level Lineage for files larger than 100 * fix lint --- .../org/apache/beam/sdk/io/FileBasedSink.java | 16 +++++- .../apache/beam/sdk/io/FileBasedSource.java | 36 +++++++++++- .../org/apache/beam/sdk/io/FileSystem.java | 14 ++++- .../org/apache/beam/sdk/io/FileSystems.java | 29 +++++++++- .../ReadAllViaFileBasedSourceTransform.java | 33 ++++++++++- .../extensions/gcp/storage/GcsFileSystem.java | 12 +++- .../gcp/storage/GcsFileSystemTest.java | 18 ++++++ .../beam/sdk/io/aws/s3/S3FileSystem.java | 12 +++- .../beam/sdk/io/aws/s3/S3FileSystemTest.java | 17 ++++++ .../beam/sdk/io/aws2/s3/S3FileSystem.java | 12 +++- .../beam/sdk/io/aws2/s3/S3FileSystemTest.java | 17 ++++++ .../blobstore/AzureBlobStoreFileSystem.java | 7 ++- .../AzureBlobStoreFileSystemTest.java | 18 ++++++ .../org/apache/beam/sdk/io/text/TextIOIT.java | 13 ++++- .../python/apache_beam/io/aws/s3filesystem.py | 8 ++- .../apache_beam/io/aws/s3filesystem_test.py | 9 +++ .../io/azure/blobstoragefilesystem.py | 9 ++- .../io/azure/blobstoragefilesystem_test.py | 12 ++++ sdks/python/apache_beam/io/filebasedsink.py | 24 +++++++- sdks/python/apache_beam/io/filebasedsource.py | 57 ++++++++++++++++++- sdks/python/apache_beam/io/filesystem.py | 6 +- sdks/python/apache_beam/io/filesystems.py | 26 +++++++-- .../apache_beam/io/gcp/gcsfilesystem.py | 10 +++- .../apache_beam/io/gcp/gcsfilesystem_test.py | 9 +++ 24 files changed, 393 insertions(+), 31 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSink.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSink.java index b7523ee12b56a..7eb04519555b2 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSink.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSink.java @@ -687,11 +687,25 @@ protected final List, ResourceId>> finalizeDestinati distinctFilenames.get(finalFilename)); distinctFilenames.put(finalFilename, result); outputFilenames.add(KV.of(result, finalFilename)); - FileSystems.reportSinkLineage(finalFilename); } + reportSinkLineage(outputFilenames); return outputFilenames; } + /** + * Report sink Lineage. Report every file if number of files no more than 100, otherwise only + * report at directory level. 
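+   *
+   * <p>For example, a write producing several hundred shards under one output directory
+   * reports just that directory, rather than every shard path.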
+ */ + private void reportSinkLineage(List, ResourceId>> outputFilenames) { + if (outputFilenames.size() <= 100) { + for (KV, ResourceId> kv : outputFilenames) { + FileSystems.reportSinkLineage(kv.getValue()); + } + } else { + FileSystems.reportSinkLineage(outputFilenames.get(0).getValue().getCurrentDirectory()); + } + } + private Collection> createMissingEmptyShards( @Nullable DestinationT dest, @Nullable Integer numShards, diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSource.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSource.java index 7ddfde441aedc..8d6e52c64a527 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSource.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileBasedSource.java @@ -26,10 +26,12 @@ import java.nio.channels.ReadableByteChannel; import java.nio.channels.SeekableByteChannel; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.ListIterator; import java.util.NoSuchElementException; import java.util.concurrent.atomic.AtomicReference; +import org.apache.beam.sdk.io.FileSystem.LineageLevel; import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.MatchResult.Metadata; @@ -297,9 +299,10 @@ public final List> split( System.currentTimeMillis() - startTime, expandedFiles.size(), splitResults.size()); + + reportSourceLineage(expandedFiles); return splitResults; } else { - FileSystems.reportSourceLineage(getSingleFileMetadata().resourceId()); if (isSplittable()) { @SuppressWarnings("unchecked") List> splits = @@ -315,6 +318,37 @@ public final List> split( } } + /** + * Report source Lineage. Due to the size limit of Beam metrics, report full file name or only dir + * depend on the number of files. + * + *
+   * <p>- Number of files <= 100: report full file paths;
+   *
+   * <p>- Number of directories <= 100: report directory names (one level up);
+   *
+   * <p>
- Otherwise, report top level only. + */ + private static void reportSourceLineage(List expandedFiles) { + if (expandedFiles.size() <= 100) { + for (Metadata metadata : expandedFiles) { + FileSystems.reportSourceLineage(metadata.resourceId()); + } + } else { + HashSet uniqueDirs = new HashSet<>(); + for (Metadata metadata : expandedFiles) { + ResourceId dir = metadata.resourceId().getCurrentDirectory(); + uniqueDirs.add(dir); + if (uniqueDirs.size() > 100) { + FileSystems.reportSourceLineage(dir, LineageLevel.TOP_LEVEL); + return; + } + } + for (ResourceId uniqueDir : uniqueDirs) { + FileSystems.reportSourceLineage(uniqueDir); + } + } + } + /** * Determines whether a file represented by this source is can be split into bundles. * diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystem.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystem.java index 11314a318b256..73caa7284e986 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystem.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystem.java @@ -157,10 +157,20 @@ protected abstract void rename( */ protected abstract String getScheme(); + public enum LineageLevel { + FILE, + TOP_LEVEL + } + + /** Report {@link Lineage} metrics for resource id at file level. */ + protected void reportLineage(ResourceIdT resourceId, Lineage lineage) { + reportLineage(resourceId, lineage, LineageLevel.FILE); + } + /** - * Report {@link Lineage} metrics for resource id. + * Report {@link Lineage} metrics for resource id to a given level. * *
Unless override by FileSystem implementations, default to no-op. */ - protected void reportLineage(ResourceIdT unusedId, Lineage unusedLineage) {} + protected void reportLineage(ResourceIdT unusedId, Lineage unusedLineage, LineageLevel level) {} } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java index a4ca9b80dce37..fb25cac6262f9 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/FileSystems.java @@ -39,6 +39,7 @@ import java.util.regex.Pattern; import javax.annotation.Nonnull; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.io.FileSystem.LineageLevel; import org.apache.beam.sdk.io.fs.CreateOptions; import org.apache.beam.sdk.io.fs.CreateOptions.StandardCreateOptions; import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; @@ -398,12 +399,36 @@ public ResourceId apply(@Nonnull Metadata input) { /** Report source {@link Lineage} metrics for resource id. */ public static void reportSourceLineage(ResourceId resourceId) { - getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.getSources()); + reportSourceLineage(resourceId, LineageLevel.FILE); } /** Report sink {@link Lineage} metrics for resource id. */ public static void reportSinkLineage(ResourceId resourceId) { - getFileSystemInternal(resourceId.getScheme()).reportLineage(resourceId, Lineage.getSinks()); + reportSinkLineage(resourceId, LineageLevel.FILE); + } + + /** + * Report source {@link Lineage} metrics for resource id at given level. + * + *
+   * <p>Internal API, no backward compatibility guaranteed.
+   */
+  public static void reportSourceLineage(ResourceId resourceId, LineageLevel level) {
+    reportLineage(resourceId, Lineage.getSources(), level);
+  }
+
+  /**
+   * Report sink {@link Lineage} metrics for resource id at given level.
+   *
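+   * <p>An illustrative call, with a hypothetical resource spec, that reports only the
+   * directory-level entry:
+   *
+   * <pre>{@code
+   * ResourceId dir = FileSystems.matchNewResource("gs://bucket/output/", true); // isDirectory
+   * FileSystems.reportSinkLineage(dir, LineageLevel.TOP_LEVEL);
+   * }</pre>
+   *
+   * <p>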
Internal API, no backward compatibility guaranteed. + */ + public static void reportSinkLineage(ResourceId resourceId, LineageLevel level) { + reportLineage(resourceId, Lineage.getSinks(), level); + } + + /** Report {@link Lineage} metrics for resource id at given level to given Lineage container. */ + private static void reportLineage(ResourceId resourceId, Lineage lineage, LineageLevel level) { + FileSystem fileSystem = getFileSystemInternal(resourceId.getScheme()); + fileSystem.reportLineage(resourceId, lineage, level); } private static class FilterResult { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ReadAllViaFileBasedSourceTransform.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ReadAllViaFileBasedSourceTransform.java index bbac337f2d0fe..843deb5cab320 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ReadAllViaFileBasedSourceTransform.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/ReadAllViaFileBasedSourceTransform.java @@ -19,7 +19,9 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.IOException; +import java.util.HashSet; import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.io.FileSystem.LineageLevel; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.range.OffsetRange; @@ -30,6 +32,7 @@ import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; +import org.checkerframework.checker.nullness.qual.Nullable; public abstract class ReadAllViaFileBasedSourceTransform extends PTransform, PCollection> { @@ -81,6 +84,9 @@ public static class SplitIntoRangesFn extends DoFn> { private final long desiredBundleSizeBytes; + // track unique resourceId met. Access it only inside reportSourceLineage + private transient @Nullable HashSet uniqueIds; + public SplitIntoRangesFn(long desiredBundleSizeBytes) { this.desiredBundleSizeBytes = desiredBundleSizeBytes; } @@ -88,6 +94,7 @@ public SplitIntoRangesFn(long desiredBundleSizeBytes) { @ProcessElement public void process(ProcessContext c) { MatchResult.Metadata metadata = c.element().getMetadata(); + reportSourceLineage(metadata.resourceId()); if (!metadata.isReadSeekEfficient()) { c.output(KV.of(c.element(), new OffsetRange(0, metadata.sizeBytes()))); return; @@ -97,6 +104,31 @@ public void process(ProcessContext c) { c.output(KV.of(c.element(), range)); } } + + /** + * Report source Lineage. Due to the size limit of Beam metrics, report full file name or only + * top level depend on the number of files. + * + *
+     * <p>- Number of files<=100, report full file paths;
+     *
+     * <p>
- Otherwise, report top level only. + */ + @SuppressWarnings("nullness") // only called in processElement, guaranteed to be non-null + private void reportSourceLineage(ResourceId resourceId) { + if (uniqueIds == null) { + uniqueIds = new HashSet<>(); + } else if (uniqueIds.isEmpty()) { + // already at capacity + FileSystems.reportSourceLineage(resourceId, LineageLevel.TOP_LEVEL); + return; + } + uniqueIds.add(resourceId); + FileSystems.reportSourceLineage(resourceId, LineageLevel.FILE); + if (uniqueIds.size() >= 100) { + // avoid reference leak + uniqueIds.clear(); + } + } } public abstract static class AbstractReadFileRangesFn @@ -140,7 +172,6 @@ public void process(ProcessContext c) throws IOException { throw e; } } - FileSystems.reportSourceLineage(resourceId); } } } diff --git a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystem.java b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystem.java index 6332051c0ddc7..32079ebf55a38 100644 --- a/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystem.java +++ b/sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystem.java @@ -217,9 +217,19 @@ protected String getScheme() { @Override protected void reportLineage(GcsResourceId resourceId, Lineage lineage) { + reportLineage(resourceId, lineage, LineageLevel.FILE); + } + + @Override + protected void reportLineage(GcsResourceId resourceId, Lineage lineage, LineageLevel level) { GcsPath path = resourceId.getGcsPath(); if (!path.getBucket().isEmpty()) { - lineage.add("gcs", ImmutableList.of(path.getBucket(), path.getObject())); + ImmutableList.Builder segments = + ImmutableList.builder().add(path.getBucket()); + if (level != LineageLevel.TOP_LEVEL && !path.getObject().isEmpty()) { + segments.add(path.getObject()); + } + lineage.add("gcs", segments.build()); } else { LOG.warn("Report Lineage on relative path {} is unsupported", path.getObject()); } diff --git a/sdks/java/extensions/google-cloud-platform-core/src/test/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystemTest.java b/sdks/java/extensions/google-cloud-platform-core/src/test/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystemTest.java index 0b79cde1f187d..f2ff7118f95de 100644 --- a/sdks/java/extensions/google-cloud-platform-core/src/test/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystemTest.java +++ b/sdks/java/extensions/google-cloud-platform-core/src/test/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystemTest.java @@ -23,6 +23,9 @@ import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Matchers.eq; import static org.mockito.Matchers.isNull; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import com.google.api.services.storage.model.Objects; @@ -38,6 +41,7 @@ import org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.MatchResult.Status; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.FluentIterable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ 
-235,6 +239,20 @@ public void testMatchNonGlobs() throws Exception { contains(toFilenames(matchResults.get(4)).toArray())); } + @Test + public void testReportLineageOnBucket() { + verifyLineage("gs://testbucket", ImmutableList.of("testbucket")); + verifyLineage("gs://testbucket/", ImmutableList.of("testbucket")); + verifyLineage("gs://testbucket/foo/bar.txt", ImmutableList.of("testbucket", "foo/bar.txt")); + } + + private void verifyLineage(String uri, List expected) { + GcsResourceId path = GcsResourceId.fromGcsPath(GcsPath.fromUri(uri)); + Lineage mockLineage = mock(Lineage.class); + gcsFileSystem.reportLineage(path, mockLineage); + verify(mockLineage, times(1)).add("gcs", expected); + } + private StorageObject createStorageObject(String gcsFilename, long fileSize) { GcsPath gcsPath = GcsPath.fromUri(gcsFilename); // Google APIs will use null for empty files. diff --git a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/s3/S3FileSystem.java b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/s3/S3FileSystem.java index 7ed56efa44bda..75d66c46478a7 100644 --- a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/s3/S3FileSystem.java +++ b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/s3/S3FileSystem.java @@ -627,7 +627,17 @@ protected S3ResourceId matchNewResource(String singleResourceSpec, boolean isDir @Override protected void reportLineage(S3ResourceId resourceId, Lineage lineage) { - lineage.add("s3", ImmutableList.of(resourceId.getBucket(), resourceId.getKey())); + reportLineage(resourceId, lineage, LineageLevel.FILE); + } + + @Override + protected void reportLineage(S3ResourceId resourceId, Lineage lineage, LineageLevel level) { + ImmutableList.Builder segments = + ImmutableList.builder().add(resourceId.getBucket()); + if (level != LineageLevel.TOP_LEVEL && !resourceId.getKey().isEmpty()) { + segments.add(resourceId.getKey()); + } + lineage.add("s3", segments.build()); } /** diff --git a/sdks/java/io/amazon-web-services/src/test/java/org/apache/beam/sdk/io/aws/s3/S3FileSystemTest.java b/sdks/java/io/amazon-web-services/src/test/java/org/apache/beam/sdk/io/aws/s3/S3FileSystemTest.java index fbef40f4b5c04..db749d7080e2c 100644 --- a/sdks/java/io/amazon-web-services/src/test/java/org/apache/beam/sdk/io/aws/s3/S3FileSystemTest.java +++ b/sdks/java/io/amazon-web-services/src/test/java/org/apache/beam/sdk/io/aws/s3/S3FileSystemTest.java @@ -34,6 +34,7 @@ import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.Matchers.anyObject; import static org.mockito.Matchers.notNull; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -74,6 +75,7 @@ import org.apache.beam.sdk.io.aws.options.S3Options; import org.apache.beam.sdk.io.fs.CreateOptions; import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -1209,6 +1211,21 @@ public void testWriteAndReadWithS3Options() throws IOException { open.close(); } + @Test + public void testReportLineageOnBucket() { + verifyLineage("s3://testbucket", ImmutableList.of("testbucket")); + verifyLineage("s3://testbucket/", ImmutableList.of("testbucket")); + verifyLineage("s3://testbucket/foo/bar.txt", ImmutableList.of("testbucket", "foo/bar.txt")); + } + 
+ private void verifyLineage(String uri, List expected) { + S3FileSystem s3FileSystem = buildMockedS3FileSystem(s3Config("mys3"), client); + S3ResourceId path = S3ResourceId.fromUri(uri); + Lineage mockLineage = mock(Lineage.class); + s3FileSystem.reportLineage(path, mockLineage); + verify(mockLineage, times(1)).add("s3", expected); + } + /** A mockito argument matcher to implement equality on GetObjectMetadataRequest. */ private static class GetObjectMetadataRequestMatcher implements ArgumentMatcher { diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystem.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystem.java index 384c8c627ee7f..e851f8333d0b2 100644 --- a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystem.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystem.java @@ -658,7 +658,17 @@ protected S3ResourceId matchNewResource(String singleResourceSpec, boolean isDir @Override protected void reportLineage(S3ResourceId resourceId, Lineage lineage) { - lineage.add("s3", ImmutableList.of(resourceId.getBucket(), resourceId.getKey())); + reportLineage(resourceId, lineage, LineageLevel.FILE); + } + + @Override + protected void reportLineage(S3ResourceId resourceId, Lineage lineage, LineageLevel level) { + ImmutableList.Builder segments = + ImmutableList.builder().add(resourceId.getBucket()); + if (level != LineageLevel.TOP_LEVEL && !resourceId.getKey().isEmpty()) { + segments.add(resourceId.getKey()); + } + lineage.add("s3", segments.build()); } /** diff --git a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystemTest.java b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystemTest.java index 423176e52a75f..39995b8b31670 100644 --- a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystemTest.java +++ b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/s3/S3FileSystemTest.java @@ -34,6 +34,7 @@ import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.ArgumentMatchers.notNull; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -55,6 +56,7 @@ import org.apache.beam.sdk.io.aws2.options.S3Options; import org.apache.beam.sdk.io.fs.CreateOptions; import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -1068,6 +1070,21 @@ public void testWriteAndRead() throws IOException { open.close(); } + @Test + public void testReportLineageOnBucket() { + verifyLineage("s3://testbucket", ImmutableList.of("testbucket")); + verifyLineage("s3://testbucket/", ImmutableList.of("testbucket")); + verifyLineage("s3://testbucket/foo/bar.txt", ImmutableList.of("testbucket", "foo/bar.txt")); + } + + private void verifyLineage(String uri, List expected) { + S3FileSystem s3FileSystem = buildMockedS3FileSystem(s3Config("mys3"), client); + S3ResourceId path = S3ResourceId.fromUri(uri); + Lineage mockLineage = mock(Lineage.class); + s3FileSystem.reportLineage(path, mockLineage); + verify(mockLineage, times(1)).add("s3", expected); + } + /** A 
mockito argument matcher to implement equality on GetHeadObjectRequest. */ private static class GetHeadObjectRequestMatcher implements ArgumentMatcher { diff --git a/sdks/java/io/azure/src/main/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystem.java b/sdks/java/io/azure/src/main/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystem.java index 5137eaf9bb2dc..bbb2e22d94ce6 100644 --- a/sdks/java/io/azure/src/main/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystem.java +++ b/sdks/java/io/azure/src/main/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystem.java @@ -453,7 +453,12 @@ protected AzfsResourceId matchNewResource(String singleResourceSpec, boolean isD @Override protected void reportLineage(AzfsResourceId resourceId, Lineage lineage) { - if (!Strings.isNullOrEmpty(resourceId.getBlob())) { + reportLineage(resourceId, lineage, LineageLevel.FILE); + } + + @Override + protected void reportLineage(AzfsResourceId resourceId, Lineage lineage, LineageLevel level) { + if (level != LineageLevel.TOP_LEVEL && !Strings.isNullOrEmpty(resourceId.getBlob())) { lineage.add( "abs", ImmutableList.of( diff --git a/sdks/java/io/azure/src/test/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystemTest.java b/sdks/java/io/azure/src/test/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystemTest.java index 545f314688c3c..27a2220c2e447 100644 --- a/sdks/java/io/azure/src/test/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystemTest.java +++ b/sdks/java/io/azure/src/test/java/org/apache/beam/sdk/io/azure/blobstore/AzureBlobStoreFileSystemTest.java @@ -25,6 +25,7 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -51,6 +52,7 @@ import org.apache.beam.sdk.io.azure.options.BlobstoreOptions; import org.apache.beam.sdk.io.fs.CreateOptions; import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.FluentIterable; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -338,4 +340,20 @@ public void testMatchNonGlobs() throws Exception { blobContainerClient.delete(); } + + @Test + public void testReportLineageOnBucket() { + verifyLineage("azfs://account/container", ImmutableList.of("account", "container")); + verifyLineage("azfs://account/container/", ImmutableList.of("account", "container")); + verifyLineage( + "azfs://account/container/foo/bar.txt", + ImmutableList.of("account", "container", "foo/bar.txt")); + } + + private void verifyLineage(String uri, List expected) { + AzfsResourceId path = AzfsResourceId.fromUri(uri); + Lineage mockLineage = mock(Lineage.class); + azureBlobStoreFileSystem.reportLineage(path, mockLineage); + verify(mockLineage, times(1)).add("abs", expected); + } } diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java index 859c03ed7750d..ecdde5cbc8fe7 100644 --- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java +++ 
b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java @@ -154,9 +154,16 @@ public void writeThenReadAll() { PipelineResult result = pipeline.run(); PipelineResult.State pipelineState = result.waitUntilFinish(); - assertEquals( - Lineage.query(result.metrics(), Lineage.Type.SOURCE), - Lineage.query(result.metrics(), Lineage.Type.SINK)); + + Set sources = Lineage.query(result.metrics(), Lineage.Type.SOURCE); + Set sinks = Lineage.query(result.metrics(), Lineage.Type.SINK); + if (numShards <= 100) { + // both should be the full files, if supported by the runner + assertEquals(sources, sinks); + } else { + // if supported by runner, both should be non-empty + assertEquals(sources.isEmpty(), sinks.isEmpty()); + } collectAndPublishMetrics(result); // Fail the test if pipeline failed. diff --git a/sdks/python/apache_beam/io/aws/s3filesystem.py b/sdks/python/apache_beam/io/aws/s3filesystem.py index e181beac4a584..ffbce5893a969 100644 --- a/sdks/python/apache_beam/io/aws/s3filesystem.py +++ b/sdks/python/apache_beam/io/aws/s3filesystem.py @@ -315,10 +315,14 @@ def delete(self, paths): if exceptions: raise BeamIOError("Delete operation failed", exceptions) - def report_lineage(self, path, lineage): + def report_lineage(self, path, lineage, level=None): try: - components = s3io.parse_s3_path(path, get_account=True) + components = s3io.parse_s3_path(path, object_optional=True) except ValueError: # report lineage is fail-safe return + if level == FileSystem.LineageLevel.TOP_LEVEL or \ + (len(components) > 1 and components[-1] == ''): + # bucket only + components = components[:-1] lineage.add('s3', *components) diff --git a/sdks/python/apache_beam/io/aws/s3filesystem_test.py b/sdks/python/apache_beam/io/aws/s3filesystem_test.py index 60e6f319b2c96..87403f482bd25 100644 --- a/sdks/python/apache_beam/io/aws/s3filesystem_test.py +++ b/sdks/python/apache_beam/io/aws/s3filesystem_test.py @@ -265,6 +265,15 @@ def test_rename(self, unused_mock_arg): src_dest_pairs = list(zip(sources, destinations)) s3io_mock.rename_files.assert_called_once_with(src_dest_pairs) + def test_lineage(self): + self._verify_lineage("s3://bucket/", ("bucket", )) + self._verify_lineage("s3://bucket/foo/bar.txt", ("bucket", "foo/bar.txt")) + + def _verify_lineage(self, uri, expected_segments): + lineage_mock = mock.MagicMock() + self.fs.report_lineage(uri, lineage_mock) + lineage_mock.add.assert_called_once_with("s3", *expected_segments) + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py b/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py index bb56fa09d3703..4495245dc54a3 100644 --- a/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py +++ b/sdks/python/apache_beam/io/azure/blobstoragefilesystem.py @@ -317,10 +317,15 @@ def delete(self, paths): if exceptions: raise BeamIOError("Delete operation failed", exceptions) - def report_lineage(self, path, lineage): + def report_lineage(self, path, lineage, level=None): try: - components = blobstorageio.parse_azfs_path(path, get_account=True) + components = blobstorageio.parse_azfs_path( + path, blob_optional=True, get_account=True) except ValueError: # report lineage is fail-safe return + if level == FileSystem.LineageLevel.TOP_LEVEL \ + or(len(components) > 1 and components[-1] == ''): + # bucket only + components = components[:-1] lineage.add('abs', *components) diff --git a/sdks/python/apache_beam/io/azure/blobstoragefilesystem_test.py 
b/sdks/python/apache_beam/io/azure/blobstoragefilesystem_test.py index cee459f5b8a20..138fe5f78b20c 100644 --- a/sdks/python/apache_beam/io/azure/blobstoragefilesystem_test.py +++ b/sdks/python/apache_beam/io/azure/blobstoragefilesystem_test.py @@ -320,6 +320,18 @@ def test_rename(self, unused_mock_blobstorageio): src_dest_pairs = list(zip(sources, destinations)) blobstorageio_mock.rename_files.assert_called_once_with(src_dest_pairs) + def test_lineage(self): + self._verify_lineage( + "azfs://storageaccount/container/", ("storageaccount", "container")) + self._verify_lineage( + "azfs://storageaccount/container/foo/bar.txt", + ("storageaccount", "container", "foo/bar.txt")) + + def _verify_lineage(self, uri, expected_segments): + lineage_mock = mock.MagicMock() + self.fs.report_lineage(uri, lineage_mock) + lineage_mock.add.assert_called_once_with("abs", *expected_segments) + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/io/filebasedsink.py b/sdks/python/apache_beam/io/filebasedsink.py index c708e117c3a1d..f9d4303c8c785 100644 --- a/sdks/python/apache_beam/io/filebasedsink.py +++ b/sdks/python/apache_beam/io/filebasedsink.py @@ -280,9 +280,31 @@ def _check_state_for_finalize_write(self, writer_results, num_shards): src_files.append(src) dst_files.append(dst) - FileSystems.report_sink_lineage(dst) + + self._report_sink_lineage(dst_glob, dst_files) return src_files, dst_files, delete_files, num_skipped + def _report_sink_lineage(self, dst_glob, dst_files): + """ + Report sink Lineage. Report every file if number of files no more than 100, + otherwise only report at directory level. + """ + if len(dst_files) <= 100: + for dst in dst_files: + FileSystems.report_sink_lineage(dst) + else: + dst = dst_glob + # dst_glob has a wildcard for shard number (see _shard_name_template) + sep = dst_glob.find('*') + if sep > 0: + dst = dst[:sep] + try: + dst, _ = FileSystems.split(dst) + except ValueError: + return # lineage report is fail-safe + + FileSystems.report_sink_lineage(dst) + @check_accessible(['file_path_prefix']) def finalize_write( self, init_result, writer_results, unused_pre_finalize_results): diff --git a/sdks/python/apache_beam/io/filebasedsource.py b/sdks/python/apache_beam/io/filebasedsource.py index efd863810ed75..a02bc6de32c73 100644 --- a/sdks/python/apache_beam/io/filebasedsource.py +++ b/sdks/python/apache_beam/io/filebasedsource.py @@ -39,6 +39,7 @@ from apache_beam.io import range_trackers from apache_beam.io.filesystem import CompressionTypes from apache_beam.io.filesystem import FileMetadata +from apache_beam.io.filesystem import FileSystem from apache_beam.io.filesystems import FileSystems from apache_beam.io.restriction_trackers import OffsetRange from apache_beam.options.value_provider import StaticValueProvider @@ -168,10 +169,38 @@ def _get_concat_source(self) -> concat_source.ConcatSource: min_bundle_size=self._min_bundle_size, splittable=splittable) single_file_sources.append(single_file_source) - FileSystems.report_source_lineage(file_name) + + self._report_source_lineage(files_metadata) self._concat_source = concat_source.ConcatSource(single_file_sources) + return self._concat_source + def _report_source_lineage(self, files_metadata): + """ + Report source Lineage. 
depend on the number of files, report full file + name, only dir, or only top level + """ + if len(files_metadata) <= 100: + for file_metadata in files_metadata: + FileSystems.report_source_lineage(file_metadata.path) + else: + size_track = set() + for file_metadata in files_metadata: + if len(size_track) >= 100: + FileSystems.report_source_lineage( + file_metadata.path, level=FileSystem.LineageLevel.TOP_LEVEL) + return + + try: + base, _ = FileSystems.split(file_metadata.path) + except ValueError: + pass + else: + size_track.add(base) + + for base in size_track: + FileSystems.report_source_lineage(base) + def open_file(self, file_name): return FileSystems.open( file_name, @@ -343,6 +372,7 @@ def __init__( self._min_bundle_size = min_bundle_size self._splittable = splittable self._compression_type = compression_type + self._size_track = None def process(self, element: Union[str, FileMetadata], *args, **kwargs) -> Iterable[Tuple[FileMetadata, OffsetRange]]: @@ -352,7 +382,8 @@ def process(self, element: Union[str, FileMetadata], *args, match_results = FileSystems.match([element]) metadata_list = match_results[0].metadata_list for metadata in metadata_list: - FileSystems.report_source_lineage(metadata.path) + self._report_source_lineage(metadata.path) + splittable = ( self._splittable and _determine_splittability_from_compression_type( metadata.path, self._compression_type)) @@ -366,6 +397,28 @@ def process(self, element: Union[str, FileMetadata], *args, metadata, OffsetRange(0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY)) + def _report_source_lineage(self, path): + """ + Report source Lineage. Due to the size limit of Beam metrics, report full + file name or only top level depend on the number of files. + + * Number of files<=100, report full file paths; + + * Otherwise, report top level only. + """ + if self._size_track is None: + self._size_track = set() + elif len(self._size_track) == 0: + FileSystems.report_source_lineage( + path, level=FileSystem.LineageLevel.TOP_LEVEL) + return + + self._size_track.add(path) + FileSystems.report_source_lineage(path) + + if len(self._size_track) >= 100: + self._size_track.clear() + class _ReadRange(DoFn): def __init__( diff --git a/sdks/python/apache_beam/io/filesystem.py b/sdks/python/apache_beam/io/filesystem.py index bdc25dcf0fe54..840fdf3309e7b 100644 --- a/sdks/python/apache_beam/io/filesystem.py +++ b/sdks/python/apache_beam/io/filesystem.py @@ -934,7 +934,11 @@ def delete(self, paths): """ raise NotImplementedError - def report_lineage(self, path, unused_lineage): + class LineageLevel: + FILE = 'FILE' + TOP_LEVEL = 'TOP_LEVEL' + + def report_lineage(self, path, unused_lineage, level=None): """ Report Lineage metrics for path. diff --git a/sdks/python/apache_beam/io/filesystems.py b/sdks/python/apache_beam/io/filesystems.py index ccbeac640765c..a32b85332b608 100644 --- a/sdks/python/apache_beam/io/filesystems.py +++ b/sdks/python/apache_beam/io/filesystems.py @@ -391,13 +391,27 @@ def get_chunk_size(path): return filesystem.CHUNK_SIZE @staticmethod - def report_source_lineage(path): - """Report source :class:`~apache_beam.metrics.metric.Lineage`.""" + def report_source_lineage(path, level=None): + """ + Report source :class:`~apache_beam.metrics.metric.Lineage`. + + Args: + path: string path to be reported. + level: the level of file path. default to + :class:`~apache_beam.io.filesystem.FileSystem.Lineage`.FILE. 
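+
+    Example (illustrative; the paths are hypothetical)::
+
+      FileSystems.report_source_lineage('gs://bucket/dir/file.txt')
+      FileSystems.report_source_lineage(
+          'gs://bucket/dir/', level=FileSystem.LineageLevel.TOP_LEVEL)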
+ """ filesystem = FileSystems.get_filesystem(path) - filesystem.report_lineage(path, Lineage.sources()) + filesystem.report_lineage(path, Lineage.sources(), level=level) @staticmethod - def report_sink_lineage(path): - """Report sink :class:`~apache_beam.metrics.metric.Lineage`.""" + def report_sink_lineage(path, level=None): + """ + Report sink :class:`~apache_beam.metrics.metric.Lineage`. + + Args: + path: string path to be reported. + level: the level of file path. default to + :class:`~apache_beam.io.filesystem.FileSystem.Lineage`.FILE. + """ filesystem = FileSystems.get_filesystem(path) - filesystem.report_lineage(path, Lineage.sinks()) + filesystem.report_lineage(path, Lineage.sinks(), level=level) diff --git a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py index 053b02d325a5c..325f70ddfd96d 100644 --- a/sdks/python/apache_beam/io/gcp/gcsfilesystem.py +++ b/sdks/python/apache_beam/io/gcp/gcsfilesystem.py @@ -366,10 +366,14 @@ def delete(self, paths): if exceptions: raise BeamIOError("Delete operation failed", exceptions) - def report_lineage(self, path, lineage): + def report_lineage(self, path, lineage, level=None): try: - bucket, blob = gcsio.parse_gcs_path(path) + components = gcsio.parse_gcs_path(path, object_optional=True) except ValueError: # report lineage is fail-safe return - lineage.add('gcs', bucket, blob) + if level == FileSystem.LineageLevel.TOP_LEVEL \ + or(len(components) > 1 and components[-1] == ''): + # bucket only + components = components[:-1] + lineage.add('gcs', *components) diff --git a/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py b/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py index 1206529faf01c..ec7fa94b05fd4 100644 --- a/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py +++ b/sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py @@ -375,6 +375,15 @@ def test_delete_error(self, mock_gcsio): self.fs.delete(files) gcsio_mock.delete_batch.assert_called() + def test_lineage(self): + self._verify_lineage("gs://bucket/", ("bucket", )) + self._verify_lineage("gs://bucket/foo/bar.txt", ("bucket", "foo/bar.txt")) + + def _verify_lineage(self, uri, expected_segments): + lineage_mock = mock.MagicMock() + self.fs.report_lineage(uri, lineage_mock) + lineage_mock.add.assert_called_once_with("gcs", *expected_segments) + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) From 42cad40532afea05d1001c4dc7f00714f2af4e0d Mon Sep 17 00:00:00 2001 From: Arun Pandian Date: Wed, 9 Oct 2024 03:44:15 -0700 Subject: [PATCH 06/14] [Dataflow Streaming] Remove call to Thread.setName and track thread name inside Work. (#32715) Thread.setName is expensive and uses upto 4% cpu on jobs with many keys. 
--- .../worker/streaming/ActiveWorkState.java | 4 ++- .../dataflow/worker/streaming/Work.java | 9 +++++ .../worker/util/BoundedQueueExecutor.java | 15 -------- .../processing/StreamingWorkScheduler.java | 2 ++ .../worker/util/BoundedQueueExecutorTest.java | 36 ------------------- 5 files changed, 14 insertions(+), 52 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java index c80c3a882e528..4607096dd66af 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/ActiveWorkState.java @@ -338,7 +338,7 @@ synchronized void printActiveWork(PrintWriter writer, Instant now) { ""); writer.println( - ""); + ""); // Use StringBuilder because we are appending in loop. StringBuilder activeWorkStatus = new StringBuilder(); int commitsPendingCount = 0; @@ -364,6 +364,8 @@ synchronized void printActiveWork(PrintWriter writer, Instant now) { activeWorkStatus.append(activeWork.getState()); activeWorkStatus.append("\n"); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java index e77823602eda7..03d1e1ae469a3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/Work.java @@ -72,6 +72,7 @@ public final class Work implements RefreshableWork { private final String latencyTrackingId; private TimedState currentState; private volatile boolean isFailed; + private volatile String processingThreadName = ""; private Work( WorkItem workItem, @@ -188,6 +189,14 @@ public void setState(State state) { this.currentState = TimedState.create(state, now); } + public String getProcessingThreadName() { + return processingThreadName; + } + + public void setProcessingThreadName(String processingThreadName) { + this.processingThreadName = processingThreadName; + } + @Override public void setFailed() { this.isFailed = true; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java index 5e3f293f7d5b6..9286be84ceaa3 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutor.java @@ -22,8 +22,6 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import javax.annotation.concurrent.GuardedBy; -import org.apache.beam.runners.dataflow.worker.streaming.ExecutableWork; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Monitor; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Monitor.Guard; @@ -223,18 +221,10 @@ private void executeMonitorHeld(Runnable work, long workBytes) { try { executor.execute( () -> { - String threadName = Thread.currentThread().getName(); try { - if (work instanceof ExecutableWork) { - String workToken = - debugFormattedWorkToken( - ((ExecutableWork) work).work().getWorkItem().getWorkToken()); - Thread.currentThread().setName(threadName + ":" + workToken); - } work.run(); } finally { decrementCounters(workBytes); - Thread.currentThread().setName(threadName); } }); } catch (RuntimeException e) { @@ -244,11 +234,6 @@ private void executeMonitorHeld(Runnable work, long workBytes) { } } - @VisibleForTesting - public static String debugFormattedWorkToken(long workToken) { - return String.format("%016x", workToken); - } - private void decrementCounters(long workBytes) { monitor.enter(); --elementsOutstanding; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java index 965a29126dc27..9a3e6eb6b099a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/processing/StreamingWorkScheduler.java @@ -225,6 +225,7 @@ private void processWork(ComputationState computationState, Work work) { Windmill.WorkItem workItem = work.getWorkItem(); String computationId = computationState.getComputationId(); ByteString key = workItem.getKey(); + work.setProcessingThreadName(Thread.currentThread().getName()); work.setState(Work.State.PROCESSING); setUpWorkLoggingContext(work.getLatencyTrackingId(), computationId); LOG.debug("Starting processing for {}:\n{}", computationId, work); @@ -288,6 +289,7 @@ private void processWork(ComputationState computationState, Work work) { } resetWorkLoggingContext(work.getLatencyTrackingId()); + work.setProcessingThreadName(""); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java index ad77958837a12..7349252899202 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/BoundedQueueExecutorTest.java @@ -293,40 +293,4 @@ public void testRenderSummaryHtml() { + "Work Queue Bytes: 0/10000000
/n"; assertEquals(expectedSummaryHtml, executor.summaryHtml()); } - - @Test - public void testExecute_updatesThreadNameForExecutableWork() throws InterruptedException { - CountDownLatch waitForWorkExecution = new CountDownLatch(1); - ExecutableWork executableWork = - createWork( - work -> { - assertTrue( - Thread.currentThread() - .getName() - .contains( - BoundedQueueExecutor.debugFormattedWorkToken( - work.getWorkItem().getWorkToken()))); - waitForWorkExecution.countDown(); - }); - executor.execute(executableWork, executableWork.getWorkItem().getSerializedSize()); - waitForWorkExecution.await(); - } - - @Test - public void testForceExecute_updatesThreadNameForExecutableWork() throws InterruptedException { - CountDownLatch waitForWorkExecution = new CountDownLatch(1); - ExecutableWork executableWork = - createWork( - work -> { - assertTrue( - Thread.currentThread() - .getName() - .contains( - BoundedQueueExecutor.debugFormattedWorkToken( - work.getWorkItem().getWorkToken()))); - waitForWorkExecution.countDown(); - }); - executor.forceExecute(executableWork, executableWork.getWorkItem().getSerializedSize()); - waitForWorkExecution.await(); - } } From b781b82842a11e79396fa6177a8944d0f50c68d5 Mon Sep 17 00:00:00 2001 From: DKPHUONG <82434977+DKER2@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:01:42 +0700 Subject: [PATCH 07/14] [Bug] fix fillna function on a single column fail (#32594) * fix bug all arg add as inputs * fix bug for fillna * Revert "fix bug for fillna" This reverts commit 2a5736c8b4af8ffcac6336a79f759f73da67dad1. * fix bug for fillna * add test for fillna a column * add test for fillna a column * add test for fillna a column * revert add test to frames_test * Move test from transforms to frames --- sdks/python/apache_beam/dataframe/frames_test.py | 11 +++++++++++ sdks/python/apache_beam/dataframe/transforms.py | 6 +++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index 076ab504addec..55d9fc5f4dfbc 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -1025,6 +1025,17 @@ def test_series_fillna_series_as_value(self): self._run_test(lambda df, df2: df.A.fillna(df2.A), df, df2) + def test_dataframe_column_fillna_constant_as_value(self): + from apache_beam.dataframe import convert + from apache_beam.testing.util import assert_that + from apache_beam.testing.util import equal_to + with beam.Pipeline(None) as p: + pcoll = ( + p | beam.Create([1.0, np.nan, -1.0]) | beam.Select(x=lambda x: x)) + df = convert.to_dataframe(pcoll) + df_new = df['x'].fillna(0) + assert_that(convert.to_pcollection(df_new), equal_to([1.0, 0.0, -1.0])) + @unittest.skipIf(PD_VERSION >= (2, 0), 'append removed in Pandas 2.0') def test_append_verify_integrity(self): df1 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(10)) diff --git a/sdks/python/apache_beam/dataframe/transforms.py b/sdks/python/apache_beam/dataframe/transforms.py index 852b49c4e2ed6..d0b5be4eb2a9b 100644 --- a/sdks/python/apache_beam/dataframe/transforms.py +++ b/sdks/python/apache_beam/dataframe/transforms.py @@ -395,7 +395,11 @@ def expr_to_stages(expr): if stage is None: # No stage available, compute this expression as part of a new stage. 
- stage = Stage(expr.args(), expr.requires_partition_by()) + stage = Stage([ + arg for arg in expr.args() + if not isinstance(arg, expressions.ConstantExpression) + ], + expr.requires_partition_by()) for arg in expr.args(): # For each argument, declare that it is also available in # this new stage. From 5f83e1d083d22d6230646348e4385383bc0f952e Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 9 Oct 2024 10:52:50 -0400 Subject: [PATCH 08/14] Fix assert in TextIOIT (#32717) --- .../src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java index ecdde5cbc8fe7..e50a8aba41624 100644 --- a/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java +++ b/sdks/java/io/file-based-io-tests/src/test/java/org/apache/beam/sdk/io/text/TextIOIT.java @@ -157,7 +157,7 @@ public void writeThenReadAll() { Set sources = Lineage.query(result.metrics(), Lineage.Type.SOURCE); Set sinks = Lineage.query(result.metrics(), Lineage.Type.SINK); - if (numShards <= 100) { + if (numShards != null && numShards <= 100) { // both should be the full files, if supported by the runner assertEquals(sources, sinks); } else { From 6e570d6e5651a7b1ff42cc035db9954776e5c2f2 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 9 Oct 2024 10:55:28 -0400 Subject: [PATCH 09/14] Bump to Dataproc 2.2 and Flink 1.17 for load tests (#32632) --- .../beam_LoadTests_Go_CoGBK_Flink_batch.yml | 4 +- .../beam_LoadTests_Go_Combine_Flink_Batch.yml | 4 +- .../beam_LoadTests_Go_GBK_Flink_Batch.yml | 4 +- .../beam_LoadTests_Go_ParDo_Flink_Batch.yml | 4 +- ...eam_LoadTests_Go_SideInput_Flink_Batch.yml | 4 +- ...eam_LoadTests_Python_CoGBK_Flink_Batch.yml | 4 +- ...m_LoadTests_Python_Combine_Flink_Batch.yml | 4 +- ...adTests_Python_Combine_Flink_Streaming.yml | 4 +- .../beam_LoadTests_Python_GBK_Flink_Batch.yml | 4 +- ...eam_LoadTests_Python_ParDo_Flink_Batch.yml | 4 +- ...LoadTests_Python_ParDo_Flink_Streaming.yml | 4 +- .../beam_Publish_Docker_Snapshots.yml | 2 +- .test-infra/dataproc/flink_cluster.sh | 10 +- .../jenkins/CommonTestProperties.groovy | 2 +- .test-infra/jenkins/Flink.groovy | 120 ------------------ 15 files changed, 29 insertions(+), 149 deletions(-) delete mode 100644 .test-infra/jenkins/Flink.groovy diff --git a/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml b/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml index fae86961ea27b..a2c347ebddb6e 100644 --- a/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml +++ b/.github/workflows/beam_LoadTests_Go_CoGBK_Flink_batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-cogbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: 
gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-go-cogbk-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml index e814cc809be24..cdb034edcd272 100644 --- a/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_Combine_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-combine-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-go-combine-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml index 8c01bc1cf3049..f95e1c831da70 100644 --- a/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_GBK_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-gbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-go-gbk-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml index ba7323a8b63cb..89b31e02261d7 100644 --- a/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_ParDo_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-pardo-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true 
HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-go-pardo-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml index 5440ce968898e..7ab3d837721bb 100644 --- a/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Go_SideInput_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-go-sideinput-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-go-sideinput-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml index e2afb2e2cfd70..9b0dec2249f6f 100644 --- a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-py-cogbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-cogbk-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml index 0f666a0b7db61..6363de044149f 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-py-cmb-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: 
https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-cmb-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml index 6f491e6b9fa98..baf950589c8e8 100644 --- a/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_Combine_Flink_Streaming.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-py-cmb-flink-streaming-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-cmb-flink-streaming-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml index c938b284a8660..e058852460909 100644 --- a/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_GBK_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-py-gbk-flk-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-py-gbk-flk-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml index b6c86e01c2995..8d907cf643bfc 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: 
us-central1-a CLUSTER_NAME: beam-loadtests-py-pardo-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-pardo-flink-batch-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml index a6443c0df10b7..142d1b5e2dc2a 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml @@ -50,12 +50,12 @@ env: GCLOUD_ZONE: us-central1-a CLUSTER_NAME: beam-loadtests-py-pardo-flink-stream-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster - FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz + FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar FLINK_TASKMANAGER_SLOTS: 1 DETACHED_MODE: true HARNESS_IMAGES_TO_PULL: gcr.io/apache-beam-testing/beam-sdk/beam_go_sdk:latest - JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.15_job_server:latest + JOB_SERVER_IMAGE: gcr.io/apache-beam-testing/beam_portability/beam_flink1.17_job_server:latest ARTIFACTS_DIR: gs://beam-flink-cluster/beam-loadtests-python-pardo-flink-stream-${{ github.run_id }} jobs: diff --git a/.github/workflows/beam_Publish_Docker_Snapshots.yml b/.github/workflows/beam_Publish_Docker_Snapshots.yml index 334fa537be565..e37a202267c4d 100644 --- a/.github/workflows/beam_Publish_Docker_Snapshots.yml +++ b/.github/workflows/beam_Publish_Docker_Snapshots.yml @@ -83,7 +83,7 @@ jobs: - name: run Publish Docker Snapshots script for Flink uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :runners:flink:1.15:job-server-container:dockerPush + gradle-command: :runners:flink:1.17:job-server-container:dockerPush arguments: | -Pdocker-repository-root=gcr.io/apache-beam-testing/beam_portability \ -Pdocker-tag-list=latest \ No newline at end of file diff --git a/.test-infra/dataproc/flink_cluster.sh b/.test-infra/dataproc/flink_cluster.sh index b623e890d08f9..759d7a6fcc38b 100755 --- a/.test-infra/dataproc/flink_cluster.sh +++ b/.test-infra/dataproc/flink_cluster.sh @@ -17,7 +17,7 @@ # Provide the following environment to run this script: # # GCLOUD_ZONE: Google cloud zone. Optional. Default: "us-central1-a" -# DATAPROC_VERSION: Dataproc version. Optional. Default: 2.1 +# DATAPROC_VERSION: Dataproc version. Optional. 
Default: 2.2
 # CLUSTER_NAME: Cluster name
 # GCS_BUCKET: GCS bucket url for Dataproc resources (init actions)
 # HARNESS_IMAGES_TO_PULL: Urls to SDK Harness' images to pull on dataproc workers (optional: 0, 1 or multiple urls for every harness image)
@@ -35,8 +35,8 @@
 # HARNESS_IMAGES_TO_PULL='gcr.io//python:latest gcr.io//java:latest' \
 # JOB_SERVER_IMAGE=gcr.io//job-server-flink:latest \
 # ARTIFACTS_DIR=gs:// \
-# FLINK_DOWNLOAD_URL=https://archive.apache.org/dist/flink/flink-1.12.3/flink-1.12.3-bin-scala_2.11.tgz \
-# HADOOP_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-9.0/flink-shaded-hadoop-2-uber-2.8.3-9.0.jar \
+# FLINK_DOWNLOAD_URL=https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz \
+# HADOOP_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar \
 # FLINK_NUM_WORKERS=2 \
 # FLINK_TASKMANAGER_SLOTS=1 \
 # DETACHED_MODE=false \
@@ -46,7 +46,7 @@ set -Eeuxo pipefail
 # GCloud properties
 GCLOUD_ZONE="${GCLOUD_ZONE:=us-central1-a}"
-DATAPROC_VERSION="${DATAPROC_VERSION:=2.1-debian}"
+DATAPROC_VERSION="${DATAPROC_VERSION:=2.2-debian}"
 GCLOUD_REGION=`echo $GCLOUD_ZONE | sed -E "s/(-[a-z])?$//"`
 MASTER_NAME="$CLUSTER_NAME-m"
@@ -133,7 +133,7 @@ function create_cluster() {
 # This is why flink init action is invoked last.
 # TODO(11/11/2022) remove --worker-machine-type and --master-machine-type once N2 CPUs quota relaxed
 # Dataproc 2.1 uses n2-standard-2 by default but there is N2 CPUs=24 quota limit
- gcloud dataproc clusters create $CLUSTER_NAME --region=$GCLOUD_REGION --num-workers=$FLINK_NUM_WORKERS \
+ gcloud dataproc clusters create $CLUSTER_NAME --region=$GCLOUD_REGION --num-workers=$FLINK_NUM_WORKERS --public-ip-address \
 --master-machine-type=n1-standard-2 --worker-machine-type=n1-standard-2 --metadata "${metadata}", \
 --image-version=$image_version --zone=$GCLOUD_ZONE --optional-components=FLINK,DOCKER --quiet
 }
diff --git a/.test-infra/jenkins/CommonTestProperties.groovy b/.test-infra/jenkins/CommonTestProperties.groovy
index c6870dea59a10..0670b96ef47c3 100644
--- a/.test-infra/jenkins/CommonTestProperties.groovy
+++ b/.test-infra/jenkins/CommonTestProperties.groovy
@@ -26,7 +26,7 @@ class CommonTestProperties {
 }
 static String getFlinkVersion() {
- return "1.15"
+ return "1.17"
 }
 static String getSparkVersion() {
diff --git a/.test-infra/jenkins/Flink.groovy b/.test-infra/jenkins/Flink.groovy
deleted file mode 100644
index 34f3b60709c0d..0000000000000
--- a/.test-infra/jenkins/Flink.groovy
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- */ - -class Flink { - private static final String flinkDownloadUrl = 'https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz' - private static final String hadoopDownloadUrl = 'https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar' - private static final String FLINK_DIR = '"$WORKSPACE/src/.test-infra/dataproc"' - private static final String FLINK_SCRIPT = 'flink_cluster.sh' - private def job - private String jobName - - Flink(job, String jobName) { - this.job = job - this.jobName = jobName - } - - /** - * Creates Flink cluster and specifies cleanup steps. - * - * @param sdkHarnessImages - the list of published SDK Harness images tags - * @param workerCount - the initial number of worker nodes - * @param jobServerImage - the Flink job server image tag. If left empty, cluster will be set up without the job server. - * @param slotsPerTaskmanager - the number of slots per Flink task manager - */ - void setUp(List sdkHarnessImages, Integer workerCount, String jobServerImage = '', Integer slotsPerTaskmanager = 1) { - setupFlinkCluster(sdkHarnessImages, workerCount, jobServerImage, slotsPerTaskmanager) - addTeardownFlinkStep() - } - - private void setupFlinkCluster(List sdkHarnessImages, Integer workerCount, String jobServerImage, Integer slotsPerTaskmanager) { - String gcsBucket = 'gs://beam-flink-cluster' - String clusterName = getClusterName() - String artifactsDir = "${gcsBucket}/${clusterName}" - String imagesToPull = sdkHarnessImages.join(' ') - - job.steps { - environmentVariables { - env("GCLOUD_ZONE", "us-central1-a") - env("CLUSTER_NAME", clusterName) - env("GCS_BUCKET", gcsBucket) - env("FLINK_DOWNLOAD_URL", flinkDownloadUrl) - env("HADOOP_DOWNLOAD_URL", hadoopDownloadUrl) - env("FLINK_NUM_WORKERS", workerCount) - env("FLINK_TASKMANAGER_SLOTS", slotsPerTaskmanager) - env("DETACHED_MODE", 'true') - - if(imagesToPull) { - env("HARNESS_IMAGES_TO_PULL", imagesToPull) - } - - if(jobServerImage) { - env("JOB_SERVER_IMAGE", jobServerImage) - env("ARTIFACTS_DIR", artifactsDir) - } - } - - shell('echo Setting up flink cluster') - shell("cd ${FLINK_DIR}; ./${FLINK_SCRIPT} create") - } - } - - /** - * Updates the number of worker nodes in a cluster. 
- * - * @param workerCount - the new number of worker nodes in the cluster - */ - void scaleCluster(Integer workerCount) { - job.steps { - shell("echo Changing number of workers to ${workerCount}") - environmentVariables { - env("FLINK_NUM_WORKERS", workerCount) - } - shell("cd ${FLINK_DIR}; ./${FLINK_SCRIPT} restart") - } - } - - private GString getClusterName() { - return "${jobName.toLowerCase().replace("_", "-")}-\$BUILD_ID" - } - - private void addTeardownFlinkStep() { - job.publishers { - postBuildScript { - buildSteps { - postBuildStep { - stopOnFailure(false) - results([ - 'SUCCESS', - 'UNSTABLE', - 'FAILURE', - 'NOT_BUILT', - 'ABORTED' - ]) - buildSteps { - shell { - command("cd ${FLINK_DIR}; ./${FLINK_SCRIPT} delete") - } - } - } - } - markBuildUnstable(false) - } - } - } -} From c31d81ca875637f8b586050cb3c80ae3f41a255d Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 9 Oct 2024 10:58:37 -0400 Subject: [PATCH 10/14] Invoke teardown when DoFn throws in portable runners (#32522) * Invoke teardown when DoFn throws in portable runners * update CHANGES.md * adjusted comment and logging --- .../beam_PostCommit_Java_PVR_Flink_Batch.json | 2 +- ...m_PostCommit_Java_PVR_Flink_Streaming.json | 2 +- .../beam_PostCommit_Java_PVR_Samza.json | 2 +- ..._PostCommit_Java_PVR_Spark3_Streaming.json | 2 +- .../beam_PostCommit_Java_PVR_Spark_Batch.json | 2 +- CHANGES.md | 1 + .../flink/job-server/flink_job_server.gradle | 1 - .../google-cloud-dataflow-java/build.gradle | 2 +- runners/samza/job-server/build.gradle | 3 ++- .../spark/job-server/spark_job_server.gradle | 2 -- .../harness/control/ProcessBundleHandler.java | 19 ++++++++++++++++++- 11 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Batch.json b/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Batch.json index b970762c83970..e3d6056a5de96 100644 --- a/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Batch.json +++ b/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Batch.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test" + "modification": 1 } diff --git a/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Streaming.json b/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Streaming.json index b60f5c4cc3c80..e3d6056a5de96 100644 --- a/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Streaming.json +++ b/.github/trigger_files/beam_PostCommit_Java_PVR_Flink_Streaming.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 0 + "modification": 1 } diff --git a/.github/trigger_files/beam_PostCommit_Java_PVR_Samza.json b/.github/trigger_files/beam_PostCommit_Java_PVR_Samza.json index b60f5c4cc3c80..e3d6056a5de96 100644 --- a/.github/trigger_files/beam_PostCommit_Java_PVR_Samza.json +++ b/.github/trigger_files/beam_PostCommit_Java_PVR_Samza.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 0 + "modification": 1 } diff --git a/.github/trigger_files/beam_PostCommit_Java_PVR_Spark3_Streaming.json b/.github/trigger_files/beam_PostCommit_Java_PVR_Spark3_Streaming.json index b60f5c4cc3c80..e3d6056a5de96 100644 --- a/.github/trigger_files/beam_PostCommit_Java_PVR_Spark3_Streaming.json +++ b/.github/trigger_files/beam_PostCommit_Java_PVR_Spark3_Streaming.json @@ -1,4 +1,4 @@ { "comment": "Modify 
this file in a trivial way to cause this test suite to run", - "modification": 0 + "modification": 1 } diff --git a/.github/trigger_files/beam_PostCommit_Java_PVR_Spark_Batch.json b/.github/trigger_files/beam_PostCommit_Java_PVR_Spark_Batch.json index b60f5c4cc3c80..e3d6056a5de96 100644 --- a/.github/trigger_files/beam_PostCommit_Java_PVR_Spark_Batch.json +++ b/.github/trigger_files/beam_PostCommit_Java_PVR_Spark_Batch.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 0 + "modification": 1 } diff --git a/CHANGES.md b/CHANGES.md index fcb02d1d996af..b9d5f2c191c91 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -80,6 +80,7 @@ ## Bugfixes * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* (Java) Fixed tearDown not invoked when DoFn throws on Portable Runners ([#18592](https://github.com/apache/beam/issues/18592), [#31381](https://github.com/apache/beam/issues/31381)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). diff --git a/runners/flink/job-server/flink_job_server.gradle b/runners/flink/job-server/flink_job_server.gradle index 56a58df4fb093..1c610477a4442 100644 --- a/runners/flink/job-server/flink_job_server.gradle +++ b/runners/flink/job-server/flink_job_server.gradle @@ -171,7 +171,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean checkpoi excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesMultimapState' excludeCategories 'org.apache.beam.sdk.testing.UsesSetState' diff --git a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index df2270d3b653f..4906d9cf9cb83 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -185,7 +185,7 @@ def commonLegacyExcludeCategories = [ 'org.apache.beam.sdk.testing.UsesGaugeMetrics', 'org.apache.beam.sdk.testing.UsesMultimapState', 'org.apache.beam.sdk.testing.UsesTestStream', - 'org.apache.beam.sdk.testing.UsesParDoLifecycle', + 'org.apache.beam.sdk.testing.UsesParDoLifecycle', // doesn't support remote runner 'org.apache.beam.sdk.testing.UsesMetricsPusher', 'org.apache.beam.sdk.testing.UsesBundleFinalizer', ] diff --git a/runners/samza/job-server/build.gradle b/runners/samza/job-server/build.gradle index f972f376e5c8c..6fc8db98a4f9c 100644 --- a/runners/samza/job-server/build.gradle +++ b/runners/samza/job-server/build.gradle @@ -90,7 +90,6 @@ def portableValidatesRunnerTask(String name, boolean docker) { excludeCategories 'org.apache.beam.sdk.testing.UsesCustomWindowMerging' excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' - excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesMultimapState' excludeCategories 'org.apache.beam.sdk.testing.UsesSetState' @@ -127,6 +126,8 @@ def portableValidatesRunnerTask(String name, boolean docker) { excludeTestsMatching 
'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testParDoShiftTimestampInvalid' // TODO(https://github.com/apache/beam/issues/21144) excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoTest$TimestampTests.testParDoShiftTimestampInvalidZeroAllowed' + // TODO(https://github.com/apache/beam/issues/32520) + excludeTestsMatching 'org.apache.beam.sdk.transforms.ParDoLifecycleTest.testTeardownCalledAfterExceptionIn*Stateful' // TODO(https://github.com/apache/beam/issues/21145) excludeTestsMatching 'org.apache.beam.sdk.transforms.DeduplicateTest.testEventTime' // TODO(https://github.com/apache/beam/issues/21146) diff --git a/runners/spark/job-server/spark_job_server.gradle b/runners/spark/job-server/spark_job_server.gradle index 6d2d4b2bafbf6..5ed5f4277bf4b 100644 --- a/runners/spark/job-server/spark_job_server.gradle +++ b/runners/spark/job-server/spark_job_server.gradle @@ -118,7 +118,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean docker, excludeCategories 'org.apache.beam.sdk.testing.UsesFailureMessage' excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderedDelivery' - excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesSetState' excludeCategories 'org.apache.beam.sdk.testing.UsesOrderedListState' @@ -187,7 +186,6 @@ def portableValidatesRunnerTask(String name, boolean streaming, boolean docker, excludeCategories 'org.apache.beam.sdk.testing.UsesGaugeMetrics' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderedDelivery' excludeCategories 'org.apache.beam.sdk.testing.UsesPerKeyOrderInBundle' - excludeCategories 'org.apache.beam.sdk.testing.UsesParDoLifecycle' excludeCategories 'org.apache.beam.sdk.testing.UsesMapState' excludeCategories 'org.apache.beam.sdk.testing.UsesMultimapState' excludeCategories 'org.apache.beam.sdk.testing.UsesSetState' diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ProcessBundleHandler.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ProcessBundleHandler.java index 0d520dcf7f5c3..c91d5ba71b89e 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ProcessBundleHandler.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ProcessBundleHandler.java @@ -596,7 +596,11 @@ public BeamFnApi.InstructionResponse.Builder processBundle(BeamFnApi.Instruction request.getProcessBundle().getProcessBundleDescriptorId(), bundleProcessor); return BeamFnApi.InstructionResponse.newBuilder().setProcessBundle(response); } catch (Exception e) { - // Make sure we clean-up from the active set of bundle processors. + // Make sure we clean up from the active set of bundle processors. + LOG.debug( + "Discard bundleProcessor for {} after exception: {}", + request.getProcessBundle().getProcessBundleDescriptorId(), + e.getMessage()); bundleProcessorCache.discard(bundleProcessor); throw e; } @@ -1168,6 +1172,18 @@ void discard() { if (this.bundleCache != null) { this.bundleCache.clear(); } + // setupFunctions are invoked in createBundleProcessor. Invoke teardownFunction here as the + // BundleProcessor is already removed from cache and won't be re-used. 
+ for (ThrowingRunnable teardownFunction : Lists.reverse(this.getTearDownFunctions())) { + try { + teardownFunction.run(); + } catch (Throwable e) { + LOG.warn( + "Exceptions are thrown from DoFn.teardown method when trying to discard " + + "ProcessBundleHandler", + e); + } + } getMetricsEnvironmentStateForBundle().discard(); for (BeamFnDataOutboundAggregator aggregator : getOutboundAggregators().values()) { aggregator.discard(); @@ -1175,6 +1191,7 @@ void discard() { } } + // this is called in cachedBundleProcessors removal listener void shutdown() { for (ThrowingRunnable tearDownFunction : getTearDownFunctions()) { LOG.debug("Tearing down function {}", tearDownFunction); From 7177baf717dc9ce080885f8c86cd83403ad96e0d Mon Sep 17 00:00:00 2001 From: Shunping Huang Date: Wed, 9 Oct 2024 12:47:17 -0400 Subject: [PATCH 11/14] Support ordered list states in python sdk and fnapi runner (#32326) * Support ordered list state in python sdk and fnapi runner. * Add test to verify integrity of multiple iterators * Add fuzz tests and fix two edge cases. * Add sortedcontainer in package dependency * Code refactoring and add a check for the supported maximum key * regenerate requirments for python images. * Refactor portable runner code for ordered list state * Return continuation tokens in portable runnner for ordered list state * Fix some lints * Apply yapf * Fix lints * Sync base image requirements with master. * Add typing for ordered list state apis. * Add typing to orderedliststate user state. * Fix a typo. * Refactor some code based on the feedback. * Fix lints * Remove the support of int argument type in ordered list state apis. * Fix formats and lints * More lints * Refactor the code to use the continuation token logic. * Fix lints --- .../fn_api_runner/worker_handlers.py | 70 +++- .../runners/worker/bundle_processor.py | 195 ++++++++++- .../runners/worker/bundle_processor_test.py | 314 ++++++++++++++++++ .../apache_beam/transforms/userstate.py | 29 ++ sdks/python/setup.py | 1 + 5 files changed, 604 insertions(+), 5 deletions(-) diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py index bcfa965c04692..c5423e167026a 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py +++ b/sdks/python/apache_beam/runners/portability/fn_api_runner/worker_handlers.py @@ -48,7 +48,9 @@ from typing import overload import grpc +from sortedcontainers import SortedSet +from apache_beam import coders from apache_beam.io import filesystems from apache_beam.io.filesystems import CompressionTypes from apache_beam.portability import common_urns @@ -959,7 +961,8 @@ class StateServicer(beam_fn_api_pb2_grpc.BeamFnStateServicer, 'multimap_keys_values_side_input', 'iterable_side_input', 'bag_user_state', - 'multimap_user_state' + 'multimap_user_state', + 'ordered_list_user_state' ]) class CopyOnWriteState(object): @@ -1021,6 +1024,8 @@ def __init__(self): self._checkpoint = None # type: Optional[StateServicer.StateType] self._use_continuation_tokens = False self._continuations = {} # type: Dict[bytes, Tuple[bytes, ...]] + self._ordered_list_keys = collections.defaultdict( + SortedSet) # type: DefaultDict[bytes, SortedSet] def checkpoint(self): # type: () -> None @@ -1050,6 +1055,14 @@ def process_instruction_id(self, unused_instruction_id): # type: (Any) -> Iterator yield + def _get_one_interval_key(self, state_key, start): + # type: (beam_fn_api_pb2.StateKey, int) -> bytes + 
state_key_copy = beam_fn_api_pb2.StateKey() + state_key_copy.CopyFrom(state_key) + state_key_copy.ordered_list_user_state.range.start = start + state_key_copy.ordered_list_user_state.range.end = start + 1 + return self._to_key(state_key_copy) + def get_raw(self, state_key, # type: beam_fn_api_pb2.StateKey continuation_token=None # type: Optional[bytes] @@ -1061,7 +1074,30 @@ def get_raw(self, 'Unknown state type: ' + state_key.WhichOneof('type')) with self._lock: - full_state = self._state[self._to_key(state_key)] + if not continuation_token: + # Compute full_state only when no continuation token is provided. + # If there is continuation token, full_state is already in + # continuation cache. No need to recompute. + full_state = [] # type: List[bytes] + if state_key.WhichOneof('type') == 'ordered_list_user_state': + maybe_start = state_key.ordered_list_user_state.range.start + maybe_end = state_key.ordered_list_user_state.range.end + persistent_state_key = beam_fn_api_pb2.StateKey() + persistent_state_key.CopyFrom(state_key) + persistent_state_key.ordered_list_user_state.ClearField("range") + + available_keys = self._ordered_list_keys[self._to_key( + persistent_state_key)] + + for i in available_keys.irange(maybe_start, + maybe_end, + inclusive=(True, False)): + entries = self._state[self._get_one_interval_key( + persistent_state_key, i)] + full_state.extend(entries) + else: + full_state.extend(self._state[self._to_key(state_key)]) + if self._use_continuation_tokens: # The token is "nonce:index". if not continuation_token: @@ -1087,14 +1123,40 @@ def append_raw( ): # type: (...) -> _Future with self._lock: - self._state[self._to_key(state_key)].append(data) + if state_key.WhichOneof('type') == 'ordered_list_user_state': + coder = coders.TupleCoder([ + coders.VarIntCoder(), + coders.coders.LengthPrefixCoder(coders.BytesCoder()) + ]).get_impl() + + for key, value in coder.decode_all(data): + self._state[self._get_one_interval_key(state_key, key)].append( + coder.encode((key, value))) + self._ordered_list_keys[self._to_key(state_key)].add(key) + else: + self._state[self._to_key(state_key)].append(data) return _Future.done() def clear(self, state_key): # type: (beam_fn_api_pb2.StateKey) -> _Future with self._lock: try: - del self._state[self._to_key(state_key)] + if state_key.WhichOneof('type') == 'ordered_list_user_state': + start = state_key.ordered_list_user_state.range.start + end = state_key.ordered_list_user_state.range.end + persistent_state_key = beam_fn_api_pb2.StateKey() + persistent_state_key.CopyFrom(state_key) + persistent_state_key.ordered_list_user_state.ClearField("range") + available_keys = self._ordered_list_keys[self._to_key( + persistent_state_key)] + + for i in list(available_keys.irange(start, + end, + inclusive=(True, False))): + del self._state[self._get_one_interval_key(persistent_state_key, i)] + available_keys.remove(i) + else: + del self._state[self._to_key(state_key)] except KeyError: # This may happen with the caching layer across bundles. Caching may # skip this storage layer for a blocking_get(key) request. 
Without diff --git a/sdks/python/apache_beam/runners/worker/bundle_processor.py b/sdks/python/apache_beam/runners/worker/bundle_processor.py index fdb13a03bb946..0f1700f524860 100644 --- a/sdks/python/apache_beam/runners/worker/bundle_processor.py +++ b/sdks/python/apache_beam/runners/worker/bundle_processor.py @@ -19,16 +19,21 @@ # pytype: skip-file +from __future__ import annotations + import base64 import bisect import collections import copy +import heapq +import itertools import json import logging import random import threading from dataclasses import dataclass from dataclasses import field +from itertools import chain from typing import TYPE_CHECKING from typing import Any from typing import Callable @@ -50,6 +55,8 @@ from google.protobuf import duration_pb2 from google.protobuf import timestamp_pb2 +from sortedcontainers import SortedDict +from sortedcontainers import SortedList import apache_beam as beam from apache_beam import coders @@ -104,7 +111,8 @@ FnApiUserRuntimeStateTypes = Union['ReadModifyWriteRuntimeState', 'CombiningValueRuntimeState', 'SynchronousSetRuntimeState', - 'SynchronousBagRuntimeState'] + 'SynchronousBagRuntimeState', + 'SynchronousOrderedListRuntimeState'] DATA_INPUT_URN = 'beam:runner:source:v1' DATA_OUTPUT_URN = 'beam:runner:sink:v1' @@ -704,6 +712,180 @@ def commit(self): to_await.get() +class RangeSet: + """For Internal Use only. A simple range set for ranges of [x,y).""" + def __init__(self) -> None: + # The start points and end points are stored separately in order. + self._sorted_starts = SortedList() + self._sorted_ends = SortedList() + + def add(self, start: int, end: int) -> None: + if start >= end: + return + + # ranges[:min_idx] and ranges[max_idx:] is unaffected by this insertion + # the first range whose end point >= the start of the new range + min_idx = self._sorted_ends.bisect_left(start) + # the first range whose start point > the end point of the new range + max_idx = self._sorted_starts.bisect_right(end) + + if min_idx >= len(self._sorted_starts) or max_idx <= 0: + # the new range is beyond any current ranges + new_start = start + new_end = end + else: + # the new range overlaps with ranges[min_idx:max_idx] + new_start = min(start, self._sorted_starts[min_idx]) + new_end = max(end, self._sorted_ends[max_idx - 1]) + + del self._sorted_starts[min_idx:max_idx] + del self._sorted_ends[min_idx:max_idx] + + self._sorted_starts.add(new_start) + self._sorted_ends.add(new_end) + + def __contains__(self, key: int) -> bool: + idx = self._sorted_starts.bisect_left(key) + return (idx < len(self._sorted_starts) and self._sorted_starts[idx] == key + ) or (idx > 0 and self._sorted_ends[idx - 1] > key) + + def __len__(self) -> int: + assert len(self._sorted_starts) == len(self._sorted_ends) + return len(self._sorted_starts) + + def __iter__(self) -> Iterator[Tuple[int, int]]: + return zip(self._sorted_starts, self._sorted_ends) + + def __str__(self) -> str: + return str(list(zip(self._sorted_starts, self._sorted_ends))) + + +class SynchronousOrderedListRuntimeState(userstate.OrderedListRuntimeState): + RANGE_MIN = -(1 << 63) + RANGE_MAX = (1 << 63) - 1 + TIMESTAMP_RANGE_MIN = timestamp.Timestamp(micros=RANGE_MIN) + TIMESTAMP_RANGE_MAX = timestamp.Timestamp(micros=RANGE_MAX) + + def __init__( + self, + state_handler: sdk_worker.CachingStateHandler, + state_key: beam_fn_api_pb2.StateKey, + value_coder: coders.Coder) -> None: + self._state_handler = state_handler + self._state_key = state_key + self._elem_coder = beam.coders.TupleCoder( + 
[coders.VarIntCoder(), coders.coders.LengthPrefixCoder(value_coder)])
+ self._cleared = False
+ self._pending_adds = SortedDict()
+ self._pending_removes = RangeSet()
+
+ def add(self, elem: Tuple[timestamp.Timestamp, Any]) -> None:
+ assert len(elem) == 2
+ key_ts, value = elem
+ key = key_ts.micros
+
+ if key >= self.RANGE_MAX or key < self.RANGE_MIN:
+ raise ValueError("key value %d is out of range" % key)
+ self._pending_adds.setdefault(key, []).append(value)
+
+ def read(self) -> Iterable[Tuple[timestamp.Timestamp, Any]]:
+ return self.read_range(self.TIMESTAMP_RANGE_MIN, self.TIMESTAMP_RANGE_MAX)
+
+ def read_range(
+ self,
+ min_timestamp: timestamp.Timestamp,
+ limit_timestamp: timestamp.Timestamp
+ ) -> Iterable[Tuple[timestamp.Timestamp, Any]]:
+ # convert timestamp to int, as sort keys are stored as int internally.
+ min_key = min_timestamp.micros
+ limit_key = limit_timestamp.micros
+
+ keys_to_add = self._pending_adds.irange(
+ min_key, limit_key, inclusive=(True, False))
+
+ # use a list comprehension here to construct the actual list
+ # of iterators over the selected range.
+ local_items = chain.from_iterable([
+ itertools.islice(
+ zip(itertools.cycle([
+ k,
+ ]), self._pending_adds[k]),
+ len(self._pending_adds[k])) for k in keys_to_add
+ ])
+
+ if not self._cleared:
+ range_query_state_key = beam_fn_api_pb2.StateKey()
+ range_query_state_key.CopyFrom(self._state_key)
+ range_query_state_key.ordered_list_user_state.range.start = min_key
+ range_query_state_key.ordered_list_user_state.range.end = limit_key
+
+ # make a deep copy here because other operations may occur in
+ # the middle of an iteration and change pending_removes
+ pending_removes_snapshot = copy.deepcopy(self._pending_removes)
+ persistent_items = filter(
+ lambda kv: kv[0] not in pending_removes_snapshot,
+ _StateBackedIterable(
+ self._state_handler, range_query_state_key, self._elem_coder))
+
+ return map(
+ lambda x: (timestamp.Timestamp(micros=x[0]), x[1]),
+ heapq.merge(persistent_items, local_items))
+
+ return map(lambda x: (timestamp.Timestamp(micros=x[0]), x[1]), local_items)
+
+ def clear(self) -> None:
+ self._cleared = True
+ self._pending_adds = SortedDict()
+ self._pending_removes = RangeSet()
+ self._pending_removes.add(self.RANGE_MIN, self.RANGE_MAX)
+
+ def clear_range(
+ self,
+ min_timestamp: timestamp.Timestamp,
+ limit_timestamp: timestamp.Timestamp) -> None:
+ min_key = min_timestamp.micros
+ limit_key = limit_timestamp.micros
+
+ # materialize the keys to remove before the actual removal
+ keys_to_remove = list(
+ self._pending_adds.irange(min_key, limit_key, inclusive=(True, False)))
+ for k in keys_to_remove:
+ del self._pending_adds[k]
+
+ if not self._cleared:
+ self._pending_removes.add(min_key, limit_key)
+
+ def commit(self) -> None:
+ futures = []
+ if self._pending_removes:
+ for start, end in self._pending_removes:
+ range_query_state_key = beam_fn_api_pb2.StateKey()
+ range_query_state_key.CopyFrom(self._state_key)
+ range_query_state_key.ordered_list_user_state.range.start = start
+ range_query_state_key.ordered_list_user_state.range.end = end
+ futures.append(self._state_handler.clear(range_query_state_key))
+
+ self._pending_removes = RangeSet()
+
+ if self._pending_adds:
+ items_to_add = []
+ for k in self._pending_adds:
+ items_to_add.extend(zip(itertools.cycle([
+ k,
+ ]), self._pending_adds[k]))
+ futures.append(
+ self._state_handler.extend(
+ self._state_key, self._elem_coder.get_impl(), items_to_add))
+ self._pending_adds = SortedDict()
+
+ if 
len(futures): + # To commit, we need to wait on every state request futures to complete. + for to_await in futures: + to_await.get() + + self._cleared = False + + class OutputTimer(userstate.BaseTimer): def __init__(self, key, @@ -850,6 +1032,17 @@ def _create_state(self, # State keys are expected in nested encoding format key=self._key_coder.encode_nested(key))), value_coder=state_spec.coder) + elif isinstance(state_spec, userstate.OrderedListStateSpec): + return SynchronousOrderedListRuntimeState( + self._state_handler, + state_key=beam_fn_api_pb2.StateKey( + ordered_list_user_state=beam_fn_api_pb2.StateKey. + OrderedListUserState( + transform_id=self._transform_id, + user_state_id=state_spec.name, + window=self._window_coder.encode(window), + key=self._key_coder.encode_nested(key))), + value_coder=state_spec.coder) else: raise NotImplementedError(state_spec) diff --git a/sdks/python/apache_beam/runners/worker/bundle_processor_test.py b/sdks/python/apache_beam/runners/worker/bundle_processor_test.py index dafb4dbd4bf05..0eb4dd9485fd3 100644 --- a/sdks/python/apache_beam/runners/worker/bundle_processor_test.py +++ b/sdks/python/apache_beam/runners/worker/bundle_processor_test.py @@ -18,24 +18,31 @@ """Unit tests for bundle processing.""" # pytype: skip-file +import random import unittest import apache_beam as beam +from apache_beam.coders import StrUtf8Coder from apache_beam.coders.coders import FastPrimitivesCoder from apache_beam.portability import common_urns from apache_beam.portability.api import beam_fn_api_pb2 from apache_beam.runners import common +from apache_beam.runners.portability.fn_api_runner.worker_handlers import StateServicer from apache_beam.runners.worker import bundle_processor from apache_beam.runners.worker import operations from apache_beam.runners.worker.bundle_processor import BeamTransformFactory from apache_beam.runners.worker.bundle_processor import BundleProcessor from apache_beam.runners.worker.bundle_processor import DataInputOperation from apache_beam.runners.worker.bundle_processor import FnApiUserStateContext +from apache_beam.runners.worker.bundle_processor import SynchronousOrderedListRuntimeState from apache_beam.runners.worker.bundle_processor import TimerInfo from apache_beam.runners.worker.data_plane import SizeBasedBufferingClosableOutputStream from apache_beam.runners.worker.data_sampler import DataSampler +from apache_beam.runners.worker.sdk_worker import GlobalCachingStateHandler +from apache_beam.runners.worker.statecache import StateCache from apache_beam.transforms import userstate from apache_beam.transforms.window import GlobalWindow +from apache_beam.utils import timestamp from apache_beam.utils.windowed_value import WindowedValue @@ -422,5 +429,312 @@ def test_user_modified_sdks_need_to_be_installed_in_runtime_env(self): "beam:version:sdk_base:apache/beam_python3.5_sdk:2.1.0-custom")) +class OrderedListStateTest(unittest.TestCase): + class NoStateCache(StateCache): + def __init__(self): + super().__init__(max_weight=0) + + @staticmethod + def _create_state(window=b"my_window", key=b"my_key", coder=StrUtf8Coder()): + state_handler = GlobalCachingStateHandler( + OrderedListStateTest.NoStateCache(), StateServicer()) + state_key = beam_fn_api_pb2.StateKey( + ordered_list_user_state=beam_fn_api_pb2.StateKey.OrderedListUserState( + window=window, key=key)) + return SynchronousOrderedListRuntimeState(state_handler, state_key, coder) + + def setUp(self): + self.state = self._create_state() + + def test_read_range(self): + T0 = 
timestamp.Timestamp.of(0) + T1 = timestamp.Timestamp.of(1) + T2 = timestamp.Timestamp.of(2) + T3 = timestamp.Timestamp.of(3) + T4 = timestamp.Timestamp.of(4) + T5 = timestamp.Timestamp.of(5) + T9 = timestamp.Timestamp.of(9) + A1, B1, A4 = [(T1, "a1"), (T1, "b1"), (T4, "a4")] + self.assertEqual([], list(self.state.read_range(T0, T5))) + + self.state.add(A1) + self.assertEqual([A1], list(self.state.read_range(T0, T5))) + + self.state.add(B1) + self.assertEqual([A1, B1], list(self.state.read_range(T0, T5))) + + self.state.add(A4) + self.assertEqual([A1, B1, A4], list(self.state.read_range(T0, T5))) + + self.assertEqual([], list(self.state.read_range(T0, T1))) + self.assertEqual([], list(self.state.read_range(T5, T9))) + self.assertEqual([A1, B1], list(self.state.read_range(T1, T2))) + self.assertEqual([], list(self.state.read_range(T2, T3))) + self.assertEqual([], list(self.state.read_range(T2, T4))) + self.assertEqual([A4], list(self.state.read_range(T4, T5))) + + def test_read(self): + T1 = timestamp.Timestamp.of(1) + T4 = timestamp.Timestamp.of(4) + A1, B1, A4 = [(T1, "a1"), (T1, "b1"), (T4, "a4")] + self.assertEqual([], list(self.state.read())) + + self.state.add(A1) + self.assertEqual([A1], list(self.state.read())) + + self.state.add(A1) + self.assertEqual([A1, A1], list(self.state.read())) + + self.state.add(B1) + self.assertEqual([A1, A1, B1], list(self.state.read())) + + self.state.add(A4) + self.assertEqual([A1, A1, B1, A4], list(self.state.read())) + + def test_clear_range(self): + T0 = timestamp.Timestamp.of(0) + T1 = timestamp.Timestamp.of(1) + T2 = timestamp.Timestamp.of(2) + T3 = timestamp.Timestamp.of(3) + T4 = timestamp.Timestamp.of(4) + T5 = timestamp.Timestamp.of(5) + A1, B1, A4, A5 = [(T1, "a1"), (T1, "b1"), (T4, "a4"), (T5, "a5")] + self.state.clear_range(T0, T1) + self.assertEqual([], list(self.state.read())) + + self.state.add(A1) + self.state.add(B1) + self.state.add(A4) + self.state.add(A5) + self.assertEqual([A1, B1, A4, A5], list(self.state.read())) + + self.state.clear_range(T0, T1) + self.assertEqual([A1, B1, A4, A5], list(self.state.read())) + + self.state.clear_range(T1, T2) + self.assertEqual([A4, A5], list(self.state.read())) + + # no side effect on clearing the same range twice + self.state.clear_range(T1, T2) + self.assertEqual([A4, A5], list(self.state.read())) + + self.state.clear_range(T3, T4) + self.assertEqual([A4, A5], list(self.state.read())) + + self.state.clear_range(T3, T5) + self.assertEqual([A5], list(self.state.read())) + + def test_add_and_clear_range_after_commit(self): + T1 = timestamp.Timestamp.of(1) + T4 = timestamp.Timestamp.of(4) + T5 = timestamp.Timestamp.of(5) + T6 = timestamp.Timestamp.of(6) + A1, B1, C1, A4, A5, A6 = [(T1, "a1"), (T1, "b1"), (T1, "c1"), + (T4, "a4"), (T5, "a5"), (T6, "a6")] + self.state.add(A1) + self.state.add(B1) + self.state.add(A4) + self.state.add(A5) + self.state.clear_range(T4, T5) + self.assertEqual([A1, B1, A5], list(self.state.read())) + + self.state.commit() + self.assertEqual(len(self.state._pending_adds), 0) + self.assertEqual(len(self.state._pending_removes), 0) + self.assertEqual([A1, B1, A5], list(self.state.read())) + + self.state.add(C1) + self.state.add(A6) + self.assertEqual([A1, B1, C1, A5, A6], list(self.state.read())) + + self.state.clear_range(T5, T6) + self.assertEqual([A1, B1, C1, A6], list(self.state.read())) + + self.state.commit() + self.assertEqual(len(self.state._pending_adds), 0) + self.assertEqual(len(self.state._pending_removes), 0) + self.assertEqual([A1, B1, C1, A6], 
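+                     # elements must read back unchanged after the second commit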
list(self.state.read())) + + def test_clear(self): + T1 = timestamp.Timestamp.of(1) + T4 = timestamp.Timestamp.of(4) + T5 = timestamp.Timestamp.of(5) + T9 = timestamp.Timestamp.of(9) + A1, B1, C1, A4, A5, B5 = [(T1, "a1"), (T1, "b1"), (T1, "c1"), + (T4, "a4"), (T5, "a5"), (T5, "b5")] + self.state.add(A1) + self.state.add(B1) + self.state.add(A4) + self.state.add(A5) + self.state.clear_range(T4, T5) + self.assertEqual([A1, B1, A5], list(self.state.read())) + self.state.commit() + + self.state.add(C1) + self.state.clear_range(T5, T9) + self.assertEqual([A1, B1, C1], list(self.state.read())) + self.state.clear() + self.assertEqual(len(self.state._pending_adds), 0) + self.assertEqual(len(self.state._pending_removes), 1) + + self.state.add(B5) + self.assertEqual([B5], list(self.state.read())) + self.state.commit() + + self.assertEqual(len(self.state._pending_adds), 0) + self.assertEqual(len(self.state._pending_removes), 0) + + self.assertEqual([B5], list(self.state.read())) + + def test_multiple_iterators(self): + T1 = timestamp.Timestamp.of(1) + T3 = timestamp.Timestamp.of(3) + T9 = timestamp.Timestamp.of(9) + A1, B1, A3, B3 = [(T1, "a1"), (T1, "b1"), (T3, "a3"), (T3, "b3")] + self.state.add(A1) + self.state.add(A3) + self.state.commit() + + iter_before_b1 = iter(self.state.read()) + self.assertEqual(A1, next(iter_before_b1)) + + self.state.add(B1) + self.assertEqual(A3, next(iter_before_b1)) + self.assertRaises(StopIteration, lambda: next(iter_before_b1)) + + self.state.add(B3) + iter_before_clear_range = iter(self.state.read()) + self.assertEqual(A1, next(iter_before_clear_range)) + self.state.clear_range(T3, T9) + self.assertEqual(B1, next(iter_before_clear_range)) + self.assertEqual(A3, next(iter_before_clear_range)) + self.assertEqual(B3, next(iter_before_clear_range)) + self.assertRaises(StopIteration, lambda: next(iter_before_clear_range)) + self.assertEqual([A1, B1], list(self.state.read())) + + iter_before_clear = iter(self.state.read()) + self.assertEqual(A1, next(iter_before_clear)) + self.state.clear() + self.assertEqual(B1, next(iter_before_clear)) + self.assertRaises(StopIteration, lambda: next(iter_before_clear)) + + self.assertEqual([], list(self.state.read())) + + def fuzz_test_helper(self, seed=0, lower=0, upper=20): + class NaiveState: + def __init__(self): + self._data = [[] for i in range((upper - lower + 1))] + self._logs = [] + + def add(self, elem): + k, v = elem + k = k.micros + self._data[k - lower].append(v) + self._logs.append("add(%d, %s)" % (k, v)) + + def clear_range(self, lo, hi): + lo = lo.micros + hi = hi.micros + for i in range(lo, hi): + self._data[i - lower] = [] + self._logs.append("clear_range(%d, %d)" % (lo, hi)) + + def clear(self): + for i in range(len(self._data)): + self._data[i] = [] + self._logs.append("clear()") + + def read(self): + self._logs.append("read()") + for i in range(len(self._data)): + for v in self._data[i]: + yield (timestamp.Timestamp(micros=(i + lower)), v) + + random.seed(seed) + + state = self._create_state() + bench_state = NaiveState() + + steps = random.randint(20, 50) + for i in range(steps): + op = random.randint(1, 100) + if 1 <= op < 70: + num = random.randint(lower, upper) + state.add((timestamp.Timestamp(micros=num), "a%d" % num)) + bench_state.add((timestamp.Timestamp(micros=num), "a%d" % num)) + elif 70 <= op < 95: + num1 = random.randint(lower, upper) + num2 = random.randint(lower, upper) + min_time = timestamp.Timestamp(micros=min(num1, num2)) + max_time = timestamp.Timestamp(micros=max(num1, num2)) + 
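+        # apply the identical clear_range to the state under test and to the
+        # naive reference model so that their contents remain comparable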
state.clear_range(min_time, max_time) + bench_state.clear_range(min_time, max_time) + elif op >= 95: + state.clear() + bench_state.clear() + + op = random.randint(1, 10) + if 1 <= op <= 9: + pass + else: + state.commit() + + a = list(bench_state.read()) + b = list(state.read()) + self.assertEqual( + a, + b, + "Mismatch occurred on seed=%d, step=%d, logs=%s" % + (seed, i, ';'.join(bench_state._logs))) + + def test_fuzz(self): + for _ in range(1000): + seed = random.randint(0, 0xffffffffffffffff) + try: + self.fuzz_test_helper(seed=seed) + except Exception as e: + raise RuntimeError("Exception occurred on seed=%d: %s" % (seed, e)) + + def test_min_max(self): + T_MIN = timestamp.Timestamp(micros=(-(1 << 63))) + T_MAX_MINUS_ONE = timestamp.Timestamp(micros=((1 << 63) - 2)) + T_MAX = timestamp.Timestamp(micros=((1 << 63) - 1)) + T0 = timestamp.Timestamp(micros=0) + INT64_MIN, INT64_MAX_MINUS_ONE, INT64_MAX = [(T_MIN, "min"), + (T_MAX_MINUS_ONE, "max"), + (T_MAX, "err")] + self.state.add(INT64_MIN) + self.state.add(INT64_MAX_MINUS_ONE) + self.assertRaises(ValueError, lambda: self.state.add(INT64_MAX)) + + self.assertEqual([INT64_MIN, INT64_MAX_MINUS_ONE], list(self.state.read())) + self.assertEqual([INT64_MIN], list(self.state.read_range(T_MIN, T0))) + self.assertEqual([INT64_MAX_MINUS_ONE], + list(self.state.read_range(T0, T_MAX))) + + def test_continuation_token(self): + T1 = timestamp.Timestamp.of(1) + T2 = timestamp.Timestamp.of(2) + T7 = timestamp.Timestamp.of(7) + T8 = timestamp.Timestamp.of(8) + A1, A2, A7, B7, A8 = [(T1, "a1"), (T2, "a2"), (T7, "a7"), + (T7, "b7"), (T8, "a8")] + self.state._state_handler._underlying._use_continuation_tokens = True + self.assertEqual([], list(self.state.read_range(T1, T8))) + + self.state.add(A1) + self.state.add(A2) + self.state.add(A7) + self.state.add(B7) + self.state.add(A8) + + self.assertEqual([A2, A7, B7], list(self.state.read_range(T2, T8))) + + self.state.commit() + self.assertEqual([A2, A7, B7], list(self.state.read_range(T2, T8))) + + self.assertEqual([A1, A2, A7, B7, A8], list(self.state.read())) + + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/transforms/userstate.py b/sdks/python/apache_beam/transforms/userstate.py index ada0b755bd6c9..cad7335381111 100644 --- a/sdks/python/apache_beam/transforms/userstate.py +++ b/sdks/python/apache_beam/transforms/userstate.py @@ -150,6 +150,17 @@ def to_runner_api( urn=common_urns.user_state.BAG.urn)) +class OrderedListStateSpec(StateSpec): + """Specification for a user DoFn ordered list state cell.""" + def to_runner_api( + self, context: 'PipelineContext') -> beam_runner_api_pb2.StateSpec: + return beam_runner_api_pb2.StateSpec( + ordered_list_spec=beam_runner_api_pb2.OrderedListStateSpec( + element_coder_id=context.coders.get_id(self.coder)), + protocol=beam_runner_api_pb2.FunctionSpec( + urn=common_urns.user_state.ORDERED_LIST.urn)) + + # TODO(BEAM-9562): Update Timer to have of() and clear() APIs. 
Timer = NamedTuple( 'Timer', @@ -372,6 +383,24 @@ class CombiningValueRuntimeState(AccumulatingRuntimeState): """Combining value state interface object passed to user code.""" +class OrderedListRuntimeState(AccumulatingRuntimeState): + """Ordered list state interface object passed to user code.""" + def read(self) -> Iterable[Tuple[Timestamp, Any]]: + raise NotImplementedError(type(self)) + + def add(self, value: Tuple[Timestamp, Any]) -> None: + raise NotImplementedError(type(self)) + + def read_range( + self, min_time_stamp: Timestamp, + limit_time_stamp: Timestamp) -> Iterable[Tuple[Timestamp, Any]]: + raise NotImplementedError(type(self)) + + def clear_range( + self, min_time_stamp: Timestamp, limit_time_stamp: Timestamp) -> None: + raise NotImplementedError(type(self)) + + class UserStateContext(object): """Wrapper allowing user state and timers to be accessed by a DoFnInvoker.""" def get_timer( diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c3189e18d2c81..6eb74e9099c18 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -388,6 +388,7 @@ def get_portability_package_data(): 'redis>=5.0.0,<6', 'regex>=2020.6.8', 'requests>=2.24.0,<3.0.0', + 'sortedcontainers>=2.4.0', 'typing-extensions>=3.7.0', 'zstandard>=0.18.0,<1', # Dynamic dependencies must be specified in a separate list, otherwise From 20d0f6e5a85c6b5738098e13e212337d86f49412 Mon Sep 17 00:00:00 2001 From: Sergei Lilichenko Date: Wed, 9 Oct 2024 09:47:48 -0700 Subject: [PATCH 12/14] Add support for global sequence processing to the "ordered" extension in Java SDK (#32540) * Initial changes to support processing global sequences. * Refactor the DoFns out of the transform and into a class hierarchy. * Next round of implementation of Global Sequence handling. * Added ticker timers in global sequence processing. * Corrected the emission batch logic. * Reworked some tests and fixed the batch output logic. * Pluggable combiner for the global sequence. * First iteration of the efficient merging accumulator * Mostly complete implementation of the accumulator and corresponding tests. * Additional round of test refinements. * Added logic to DQL the records below the global sequence range. * Added providing a global sequence combiner through a handler. * Added SequenceRangeAccumulatorCoder and tests. Improved logic of creating timers. * Fixed logging levels (moved them to "trace") on several transforms. * Round of code improvements and cleanups. * Tests to verify that the the global sequence is correctly produced by the transform. * Added batch processing verification to the global sequence processing. * A round of documentation update and minor clean up. * Fixed the description in CHANGES.md * Polish by "spotless" * Polish by "spotless" * Removed unneeded logging configuration file. * Made ContiguousSequenceRange open ended. * Removed details from 2.60.0 section in CHANGES.md. * Update sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/DefaultSequenceCombiner.java Co-authored-by: Danny McCormick * Fixed spotless related errors. * Added a note about the new functionality to CHANGES.md * Added clarification around the data structure used in the sequence combiner. * Added clarification around the data structure used in the sequence combiner. * Fixed the problem with allowed lateness being set to 0 in the global sequence tracker. * Parameterized the GlobalSequenceTracker with the max number of events to trigger the re-evaluation. Fixed accidentally disabled unit tests. 
* Made the event timer used to wait for the event arrival respect the lateness of the input. * Created new failure reason code - "before initial sequence" --------- Co-authored-by: Danny McCormick --- CHANGES.md | 1 + sdks/java/extensions/ordered/build.gradle | 6 + .../ordered/ContiguousSequenceRange.java | 83 +++ .../sdk/extensions/ordered/EventExaminer.java | 9 +- .../ordered/GlobalSequenceTracker.java | 112 +++ .../ordered/GlobalSequencesProcessorDoFn.java | 276 +++++++ .../ordered/OrderedEventProcessor.java | 685 +++++------------- .../ordered/OrderedEventProcessorResult.java | 37 +- .../ordered/OrderedProcessingHandler.java | 80 ++ .../ordered/OrderedProcessingStatus.java | 17 +- .../ordered/PerKeyTickerGenerator.java | 132 ++++ .../extensions/ordered/ProcessingState.java | 87 ++- .../sdk/extensions/ordered/ProcessorDoFn.java | 427 +++++++++++ .../ordered/SequencePerKeyProcessorDoFn.java | 294 ++++++++ .../extensions/ordered/UnprocessedEvent.java | 3 +- .../combiner/DefaultSequenceCombiner.java | 122 ++++ .../combiner/SequenceRangeAccumulator.java | 296 ++++++++ .../ordered/combiner/package-info.java | 23 + .../sdk/extensions/ordered/package-info.java | 4 +- ...deredEventProcessorGlobalSequenceTest.java | 534 ++++++++++++++ ...eredEventProcessorPerKeySequenceTest.java} | 358 ++------- .../OrderedEventProcessorTestBase.java | 395 ++++++++++ .../StringBufferOrderedProcessingHandler.java | 18 + .../SequenceRangeAccumulatorCoderTest.java | 71 ++ .../SequenceRangeAccumulatorTest.java | 400 ++++++++++ 25 files changed, 3639 insertions(+), 831 deletions(-) create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ContiguousSequenceRange.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequenceTracker.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequencesProcessorDoFn.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/PerKeyTickerGenerator.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessorDoFn.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/SequencePerKeyProcessorDoFn.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/DefaultSequenceCombiner.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulator.java create mode 100644 sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/package-info.java create mode 100644 sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorGlobalSequenceTest.java rename sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/{OrderedEventProcessorTest.java => OrderedEventProcessorPerKeySequenceTest.java} (71%) create mode 100644 sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTestBase.java create mode 100644 sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorCoderTest.java create mode 100644 sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorTest.java diff --git a/CHANGES.md b/CHANGES.md 
index b9d5f2c191c91..774abefcb0661 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -68,6 +68,7 @@ ## New Features / Improvements * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Added support for processing events which use a global sequence to "ordered" extension (Java) [#32540](https://github.com/apache/beam/pull/32540) ## Breaking Changes diff --git a/sdks/java/extensions/ordered/build.gradle b/sdks/java/extensions/ordered/build.gradle index 10c9785b9eed6..8bee1901bd3ad 100644 --- a/sdks/java/extensions/ordered/build.gradle +++ b/sdks/java/extensions/ordered/build.gradle @@ -28,6 +28,12 @@ dependencies { implementation library.java.vendored_guava_32_1_2_jre testImplementation library.java.junit testImplementation library.java.hamcrest + testImplementation library.java.slf4j_jdk14 testImplementation project(path: ':sdks:java:core') + testImplementation 'junit:junit:4.13.1' + testImplementation project(path: ':runners:google-cloud-dataflow-java') testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(path: ":runners:google-cloud-dataflow-java") + testImplementation project(path: ":sdks:java:extensions:google-cloud-platform-core") + testImplementation project(path: ":sdks:java:io:google-cloud-platform") } \ No newline at end of file diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ContiguousSequenceRange.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ContiguousSequenceRange.java new file mode 100644 index 0000000000000..c16cf9328dcd6 --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ContiguousSequenceRange.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.ordered; + +import com.google.auto.value.AutoValue; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.apache.beam.sdk.coders.CoderException; +import org.apache.beam.sdk.coders.CustomCoder; +import org.apache.beam.sdk.coders.InstantCoder; +import org.apache.beam.sdk.coders.VarLongCoder; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.joda.time.Instant; + +/** A range of contiguous event sequences and the latest timestamp of the events in the range. 
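+ * The start of the range is inclusive and the end is exclusive.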
*/ +@AutoValue +public abstract class ContiguousSequenceRange { + public static final ContiguousSequenceRange EMPTY = + ContiguousSequenceRange.of( + Long.MIN_VALUE, Long.MIN_VALUE, Instant.ofEpochMilli(Long.MIN_VALUE)); + + /** @return inclusive starting sequence */ + public abstract long getStart(); + + /** @return exclusive end sequence */ + public abstract long getEnd(); + + /** @return latest timestamp of all events in the range */ + public abstract Instant getTimestamp(); + + public static ContiguousSequenceRange of(long start, long end, Instant timestamp) { + return new AutoValue_ContiguousSequenceRange(start, end, timestamp); + } + + static class CompletedSequenceRangeCoder extends CustomCoder { + + private static final CompletedSequenceRangeCoder INSTANCE = new CompletedSequenceRangeCoder(); + + static CompletedSequenceRangeCoder of() { + return INSTANCE; + } + + private CompletedSequenceRangeCoder() {} + + @Override + public void encode( + ContiguousSequenceRange value, @UnknownKeyFor @NonNull @Initialized OutputStream outStream) + throws @UnknownKeyFor @NonNull @Initialized CoderException, @UnknownKeyFor @NonNull + @Initialized IOException { + VarLongCoder.of().encode(value.getStart(), outStream); + VarLongCoder.of().encode(value.getEnd(), outStream); + InstantCoder.of().encode(value.getTimestamp(), outStream); + } + + @Override + public ContiguousSequenceRange decode(@UnknownKeyFor @NonNull @Initialized InputStream inStream) + throws @UnknownKeyFor @NonNull @Initialized CoderException, @UnknownKeyFor @NonNull + @Initialized IOException { + long start = VarLongCoder.of().decode(inStream); + long end = VarLongCoder.of().decode(inStream); + Instant timestamp = InstantCoder.of().decode(inStream); + return ContiguousSequenceRange.of(start, end, timestamp); + } + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/EventExaminer.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/EventExaminer.java index 1e4fe75655178..b5de67f16ceda 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/EventExaminer.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/EventExaminer.java @@ -31,7 +31,8 @@ public interface EventExaminer> extends Serializable { /** - * Is this event the first expected event for the given key and window? + * Is this event the first expected event for the given key and window if the per key sequence is + * used? In case of global sequence it determines the first global sequence event. * * @param sequenceNumber the sequence number of the event as defined by the key of the input * PCollection to {@link OrderedEventProcessor} @@ -41,8 +42,8 @@ public interface EventExaminer> boolean isInitialEvent(long sequenceNumber, EventT event); /** - * If the event was the first event in the sequence, create the state to hold the required data - * needed for processing. This data will be persisted. + * If the event was the first event for a given key, create the state to hold the required data + * needed for processing. This data will be persisted in a Beam state. * * @param event the first event in the sequence. * @return the state to persist. @@ -53,6 +54,8 @@ public interface EventExaminer> /** * Is this event the last expected event for a given key and window? * + *
+   * <p>
Note, this method is not used yet with global sequences. + * * @param sequenceNumber of the event * @param event being processed * @return true if the last event. There are cases where it's impossible to know whether it's the diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequenceTracker.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequenceTracker.java new file mode 100644 index 0000000000000..aa12c30a5317c --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequenceTracker.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.ordered; + +import org.apache.beam.sdk.extensions.ordered.ContiguousSequenceRange.CompletedSequenceRangeCoder; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.windowing.AfterFirst; +import org.apache.beam.sdk.transforms.windowing.AfterPane; +import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; +import org.apache.beam.sdk.transforms.windowing.Repeatedly; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.transforms.windowing.WindowFn; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionView; +import org.apache.beam.sdk.values.TimestampedValue; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; + +/** + * PTransform to produce the side input of the maximum contiguous range of sequence numbers. + * + * @param type of event key + * @param type of event + * @param type of processing result + * @param type of state + */ +class GlobalSequenceTracker< + EventKeyT, EventT, ResultT, StateT extends MutableState> + extends PTransform< + PCollection>>>, + PCollectionView> { + + private final Combine.GloballyAsSingletonView< + TimestampedValue>>, ContiguousSequenceRange> + sideInputProducer; + private final @Nullable Duration frequencyOfGeneration; + private final int maxElementsBeforeReevaluatingGlobalSequence; + + /** + * Constructor used in batch pipelines. 
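+   * <p>In batch, the side input is computed once over the whole bounded input,
+   * so no periodic re-evaluation trigger is installed.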
+ * + * @param sideInputProducer + */ + public GlobalSequenceTracker( + Combine.GloballyAsSingletonView< + TimestampedValue>>, ContiguousSequenceRange> + sideInputProducer) { + this.sideInputProducer = sideInputProducer; + this.frequencyOfGeneration = null; + this.maxElementsBeforeReevaluatingGlobalSequence = 0; + } + + public GlobalSequenceTracker( + Combine.GloballyAsSingletonView< + TimestampedValue>>, ContiguousSequenceRange> + sideInputProducer, + Duration globalSequenceGenerationFrequency, + int maxElementsBeforeReevaluatingGlobalSequence) { + this.sideInputProducer = sideInputProducer; + this.frequencyOfGeneration = globalSequenceGenerationFrequency; + this.maxElementsBeforeReevaluatingGlobalSequence = maxElementsBeforeReevaluatingGlobalSequence; + } + + @Override + public PCollectionView expand( + PCollection>>> input) { + input + .getPipeline() + .getCoderRegistry() + .registerCoderForClass(ContiguousSequenceRange.class, CompletedSequenceRangeCoder.of()); + + if (frequencyOfGeneration != null) { + // This branch will only be executed in case of streaming pipelines. + // For batch pipelines the side input should only be computed once. + input = + input.apply( + "Triggering Setup", + // Reproduce the windowing of the input PCollection, but change the triggering + // in order to create a slowing changing side input + Window.>>>into( + (WindowFn>>, ?>) + input.getWindowingStrategy().getWindowFn()) + .accumulatingFiredPanes() + .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness()) + .triggering( + Repeatedly.forever( + AfterFirst.of( + AfterPane.elementCountAtLeast( + maxElementsBeforeReevaluatingGlobalSequence), + AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(frequencyOfGeneration))))); + } + return input.apply("Create Side Input", sideInputProducer); + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequencesProcessorDoFn.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequencesProcessorDoFn.java new file mode 100644 index 0000000000000..64c2d119c97d5 --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/GlobalSequencesProcessorDoFn.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered; + +import org.apache.beam.sdk.coders.BooleanCoder; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.extensions.ordered.ProcessingState.ProcessingStateCoder; +import org.apache.beam.sdk.state.OrderedListState; +import org.apache.beam.sdk.state.StateSpec; +import org.apache.beam.sdk.state.StateSpecs; +import org.apache.beam.sdk.state.TimeDomain; +import org.apache.beam.sdk.state.Timer; +import org.apache.beam.sdk.state.TimerSpec; +import org.apache.beam.sdk.state.TimerSpecs; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollectionView; +import org.apache.beam.sdk.values.TupleTag; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Main Stateful DoFn used to process events in the global sequence mode. + * + * @param + * @param + * @param + * @param + */ +class GlobalSequencesProcessorDoFn< + EventT, EventKeyT, ResultT, StateT extends MutableState> + extends ProcessorDoFn { + + private static final Logger LOG = LoggerFactory.getLogger(GlobalSequencesProcessorDoFn.class); + + private static final String BATCH_EMISSION_TIMER = "batchTimer"; + + @TimerId(BATCH_EMISSION_TIMER) + @SuppressWarnings("unused") + private final TimerSpec batchTimerSpec = TimerSpecs.timer(TimeDomain.EVENT_TIME); + + private static final String BUFFERED_EVENTS = "bufferedEvents"; + + @StateId(BUFFERED_EVENTS) + @SuppressWarnings("unused") + private final StateSpec> bufferedEventsSpec; + + @StateId(PROCESSING_STATE) + @SuppressWarnings("unused") + private final StateSpec>> processingStateSpec; + + @StateId(MUTABLE_STATE) + @SuppressWarnings("unused") + private final StateSpec> mutableStateSpec; + + @StateId(WINDOW_CLOSED) + @SuppressWarnings("unused") + private final StateSpec> windowClosedSpec; + + @TimerId(STATUS_EMISSION_TIMER) + @SuppressWarnings("unused") + private final TimerSpec statusEmissionTimer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME); + + private final PCollectionView latestContiguousRangeSideInput; + + private final Duration maxLateness; + + GlobalSequencesProcessorDoFn( + EventExaminer eventExaminer, + Coder eventCoder, + Coder stateCoder, + Coder keyCoder, + TupleTag> mainOutputTupleTag, + TupleTag> statusTupleTag, + Duration statusUpdateFrequency, + TupleTag>>> unprocessedEventTupleTag, + boolean produceStatusUpdateOnEveryEvent, + long maxNumberOfResultsToProduce, + PCollectionView latestContiguousRangeSideInput, + Duration maxLateness) { + super( + eventExaminer, + mainOutputTupleTag, + statusTupleTag, + statusUpdateFrequency, + unprocessedEventTupleTag, + produceStatusUpdateOnEveryEvent, + maxNumberOfResultsToProduce); + + this.latestContiguousRangeSideInput = latestContiguousRangeSideInput; + this.bufferedEventsSpec = StateSpecs.orderedList(eventCoder); + this.processingStateSpec = StateSpecs.value(ProcessingStateCoder.of(keyCoder)); + this.mutableStateSpec = StateSpecs.value(stateCoder); + this.windowClosedSpec = StateSpecs.value(BooleanCoder.of()); + this.maxLateness = maxLateness; + } + + @Override + boolean checkForFirstOrLastEvent() { + return false; + } + + @Override + boolean checkForSequenceGapInBufferedEvents() { + return false; + } + + @ProcessElement + public void processElement( + ProcessContext context, + @Element KV> eventAndSequence, + @StateId(BUFFERED_EVENTS) 
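+          // out-of-sequence events buffered, in sequence order, until the
+          // batch emission timer fires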
OrderedListState bufferedEventsProxy, + @AlwaysFetched @StateId(PROCESSING_STATE) + ValueState> processingStateProxy, + @StateId(MUTABLE_STATE) ValueState mutableStateProxy, + @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, + @TimerId(BATCH_EMISSION_TIMER) Timer batchEmissionTimer, + MultiOutputReceiver outputReceiver, + BoundedWindow window) { + + ContiguousSequenceRange lastContiguousRange = context.sideInput(latestContiguousRangeSideInput); + + EventT event = eventAndSequence.getValue().getValue(); + EventKeyT key = eventAndSequence.getKey(); + long sequence = eventAndSequence.getValue().getKey(); + + if (LOG.isTraceEnabled()) { + LOG.trace(key + ": " + sequence + " lastRange: " + lastContiguousRange); + } + + ProcessingState processingState = processingStateProxy.read(); + + if (processingState == null) { + // This is the first time we see this key/window pair + processingState = new ProcessingState<>(key); + if (statusUpdateFrequency != null) { + // Set up the timer to produce the status of the processing on a regular basis + statusEmissionTimer.offset(statusUpdateFrequency).setRelative(); + } + } + + processingState.updateGlobalSequenceDetails(lastContiguousRange); + + if (event == null) { + // This is a ticker event. We only need to update the state as it relates to the global + // sequence. + processingStateProxy.write(processingState); + + setBatchEmissionTimerIfNeeded(batchEmissionTimer, processingState); + + return; + } + + if (numberOfResultsBeforeBundleStart == null) { + // Per key processing is synchronized by Beam. There is no need to have it here. + numberOfResultsBeforeBundleStart = processingState.getResultCount(); + } + + processingState.eventReceived(); + + StateT state = + processNewEvent( + sequence, + event, + processingState, + mutableStateProxy, + bufferedEventsProxy, + outputReceiver); + + saveStates( + processingStateProxy, + processingState, + mutableStateProxy, + state, + outputReceiver, + window.maxTimestamp()); + + setBatchEmissionTimerIfNeeded(batchEmissionTimer, processingState); + } + + private void setBatchEmissionTimerIfNeeded( + Timer batchEmissionTimer, ProcessingState processingState) { + ContiguousSequenceRange lastCompleteGlobalSequence = processingState.getLastContiguousRange(); + if (lastCompleteGlobalSequence != null + && processingState.thereAreGloballySequencedEventsToBeProcessed()) { + batchEmissionTimer.set(lastCompleteGlobalSequence.getTimestamp().plus(maxLateness)); + } + } + + @OnTimer(BATCH_EMISSION_TIMER) + public void onBatchEmission( + OnTimerContext context, + @StateId(BUFFERED_EVENTS) OrderedListState bufferedEventsState, + @AlwaysFetched @StateId(PROCESSING_STATE) + ValueState> processingStatusState, + @AlwaysFetched @StateId(MUTABLE_STATE) ValueState mutableStateState, + @TimerId(BATCH_EMISSION_TIMER) Timer batchEmissionTimer, + MultiOutputReceiver outputReceiver) { + + // At this point everything in the buffered state is ready to be processed up to the latest + // global sequence. + @Nullable ProcessingState processingState = processingStatusState.read(); + if (processingState == null) { + LOG.warn("Missing the processing state. 
Probably occurred during pipeline drainage"); + return; + } + + StateT state = mutableStateState.read(); + + ContiguousSequenceRange lastContiguousRange = processingState.getLastContiguousRange(); + if (lastContiguousRange == null) { + LOG.warn("Last complete global instance is null."); + return; + } + + Long earliestBufferedSequence = processingState.getEarliestBufferedSequence(); + if (earliestBufferedSequence == null) { + LOG.warn("Earliest buffered sequence is null."); + return; + } + + if (LOG.isTraceEnabled()) { + LOG.trace("Emission timer: " + processingState); + } + + this.numberOfResultsBeforeBundleStart = processingState.getResultCount(); + + state = + processBufferedEventRange( + processingState, + state, + bufferedEventsState, + outputReceiver, + batchEmissionTimer, + lastContiguousRange); + + saveStates( + processingStatusState, + processingState, + mutableStateState, + state, + outputReceiver, + // TODO: validate that this is correct. + context.window().maxTimestamp()); + } + + @OnTimer(STATUS_EMISSION_TIMER) + @SuppressWarnings("unused") + public void onStatusEmission( + MultiOutputReceiver outputReceiver, + @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, + @StateId(WINDOW_CLOSED) ValueState windowClosedState, + @StateId(PROCESSING_STATE) ValueState> processingStateState) { + + processStatusTimerEvent( + outputReceiver, statusEmissionTimer, windowClosedState, processingStateState); + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessor.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessor.java index 935647c0e7e5e..fb23a7c8667af 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessor.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessor.java @@ -19,52 +19,44 @@ import com.google.auto.value.AutoValue; import java.util.Arrays; -import java.util.Iterator; import javax.annotation.Nullable; import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.coders.BooleanCoder; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.VarLongCoder; -import org.apache.beam.sdk.extensions.ordered.ProcessingState.ProcessingStateCoder; -import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.Reason; +import org.apache.beam.sdk.extensions.ordered.OrderedProcessingHandler.OrderedProcessingGlobalSequenceHandler; import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.UnprocessedEventCoder; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.schemas.SchemaRegistry; -import org.apache.beam.sdk.state.OrderedListState; -import org.apache.beam.sdk.state.StateSpec; -import org.apache.beam.sdk.state.StateSpecs; -import org.apache.beam.sdk.state.TimeDomain; -import org.apache.beam.sdk.state.Timer; -import org.apache.beam.sdk.state.TimerSpec; -import org.apache.beam.sdk.state.TimerSpecs; -import org.apache.beam.sdk.state.ValueState; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.Flatten; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.values.KV; import 
org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollection.IsBounded; +import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.sdk.values.TypeDescriptor; -import org.joda.time.Duration; import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Transform for processing ordered events. Events are grouped by the key and within each key they * are applied according to the provided sequence. Events which arrive out of sequence are buffered * and processed after all the missing events for a given key have arrived. * - * @param - * @param - * @param + *
+ * <p>
There are two sequencing modes - a sequence per key and a global sequence. See {@link + * OrderedProcessingHandler} for details on how to configure this transform. + * + * @param type of event + * @param type of event key + * @param type of the state */ @AutoValue @SuppressWarnings({"nullness", "TypeNameShadowing"}) @@ -74,6 +66,18 @@ public abstract class OrderedEventProcessor< PCollection>>, OrderedEventProcessorResult> { + public static final String GLOBAL_SEQUENCE_TRACKER = "global_sequence_tracker"; + + /** + * Create the transform. + * + * @param handler provides the configuration of this transform + * @param type of event + * @param type of event key + * @param type of the result object + * @param type of the state to store + * @return the transform + */ public static < EventTypeT, EventKeyTypeT, @@ -129,10 +133,67 @@ public OrderedEventProcessorResult expand( throw new RuntimeException("Unable to get result coder", e); } - PCollectionTuple processingResult = + KvCoder mainOutputCoder = KvCoder.of(keyCoder, resultCoder); + KvCoder processingStatusCoder = + KvCoder.of(keyCoder, getOrderedProcessingStatusCoder(pipeline)); + KvCoder>> unprocessedEventsCoder = + KvCoder.of( + keyCoder, KvCoder.of(VarLongCoder.of(), new UnprocessedEventCoder<>(eventCoder))); + + if (handler instanceof OrderedProcessingGlobalSequenceHandler) { + OrderedProcessingGlobalSequenceHandler + globalSequenceHandler = + (OrderedProcessingGlobalSequenceHandler) handler; + + return expandGlobalSequenceProcessing( + input, + mainOutput, + statusOutput, + unprocessedEventOutput, + handler, + pipeline, + keyCoder, + eventCoder, + stateCoder, + mainOutputCoder, + processingStatusCoder, + unprocessedEventsCoder, + globalSequenceHandler); + } else { + return expandPerKeyProcessing( + input, + mainOutput, + statusOutput, + unprocessedEventOutput, + handler, + pipeline, + keyCoder, + eventCoder, + stateCoder, + mainOutputCoder, + processingStatusCoder, + unprocessedEventsCoder); + } + } + + private OrderedEventProcessorResult expandPerKeyProcessing( + PCollection>> input, + TupleTag> mainOutput, + TupleTag> statusOutput, + TupleTag>>> unprocessedEventOutput, + OrderedProcessingHandler handler, + Pipeline pipeline, + Coder keyCoder, + Coder eventCoder, + Coder stateCoder, + KvCoder mainOutputCoder, + KvCoder processingStatusCoder, + KvCoder>> unprocessedEventsCoder) { + PCollectionTuple processingResult; + processingResult = input.apply( ParDo.of( - new OrderedProcessorDoFn<>( + new SequencePerKeyProcessorDoFn<>( handler.getEventExaminer(), eventCoder, stateCoder, @@ -146,13 +207,6 @@ public OrderedEventProcessorResult expand( .withOutputTags( mainOutput, TupleTagList.of(Arrays.asList(statusOutput, unprocessedEventOutput)))); - - KvCoder mainOutputCoder = KvCoder.of(keyCoder, resultCoder); - KvCoder processingStatusCoder = - KvCoder.of(keyCoder, getOrderedProcessingStatusCoder(pipeline)); - KvCoder>> unprocessedEventsCoder = - KvCoder.of( - keyCoder, KvCoder.of(VarLongCoder.of(), new UnprocessedEventCoder<>(eventCoder))); return new OrderedEventProcessorResult<>( pipeline, processingResult.get(mainOutput).setCoder(mainOutputCoder), @@ -163,6 +217,84 @@ public OrderedEventProcessorResult expand( unprocessedEventOutput); } + private OrderedEventProcessorResult expandGlobalSequenceProcessing( + PCollection>> input, + TupleTag> mainOutput, + TupleTag> statusOutput, + TupleTag>>> unprocessedEventOutput, + OrderedProcessingHandler handler, + Pipeline pipeline, + Coder keyCoder, + Coder eventCoder, + Coder stateCoder, + KvCoder 
mainOutputCoder, + KvCoder processingStatusCoder, + KvCoder>> unprocessedEventsCoder, + OrderedProcessingGlobalSequenceHandler + globalSequenceHandler) { + PCollectionTuple processingResult; + boolean streamingProcessing = input.isBounded() == IsBounded.UNBOUNDED; + + final PCollectionView latestContiguousRange = + input + .apply("Convert to SequenceAndTimestamp", ParDo.of(new ToTimestampedEventConverter<>())) + .apply( + "Global Sequence Tracker", + streamingProcessing + ? new GlobalSequenceTracker<>( + globalSequenceHandler.getGlobalSequenceCombiner(), + globalSequenceHandler.getContiguousSequenceRangeReevaluationFrequency(), + globalSequenceHandler + .getMaxElementCountToTriggerContinuousSequenceRangeReevaluation()) + : new GlobalSequenceTracker<>( + globalSequenceHandler.getGlobalSequenceCombiner())); + + if (streamingProcessing) { + PCollection>> tickers = + input.apply( + "Create Tickers", + new PerKeyTickerGenerator<>( + keyCoder, + eventCoder, + globalSequenceHandler.getContiguousSequenceRangeReevaluationFrequency())); + + input = + PCollectionList.of(input) + .and(tickers) + .apply("Combine Events and Tickers", Flatten.pCollections()) + .setCoder(tickers.getCoder()); + } + processingResult = + input.apply( + ParDo.of( + new GlobalSequencesProcessorDoFn<>( + handler.getEventExaminer(), + eventCoder, + stateCoder, + keyCoder, + mainOutput, + statusOutput, + handler.getStatusUpdateFrequency(), + unprocessedEventOutput, + handler.isProduceStatusUpdateOnEveryEvent(), + handler.getMaxOutputElementsPerBundle(), + latestContiguousRange, + input.getWindowingStrategy().getAllowedLateness())) + .withOutputTags( + mainOutput, + TupleTagList.of(Arrays.asList(statusOutput, unprocessedEventOutput))) + .withSideInput(GLOBAL_SEQUENCE_TRACKER, latestContiguousRange)); + return new OrderedEventProcessorResult<>( + pipeline, + processingResult.get(mainOutput).setCoder(mainOutputCoder), + mainOutput, + processingResult.get(statusOutput).setCoder(processingStatusCoder), + statusOutput, + processingResult.get(unprocessedEventOutput).setCoder(unprocessedEventsCoder), + unprocessedEventOutput, + latestContiguousRange); + } + private static Coder getOrderedProcessingStatusCoder(Pipeline pipeline) { SchemaRegistry schemaRegistry = pipeline.getSchemaRegistry(); Coder result; @@ -179,497 +311,16 @@ private static Coder getOrderedProcessingStatusCoder(Pi return result; } - /** - * Main DoFn for processing ordered events. 
- * - * @param - * @param - * @param - */ - static class OrderedProcessorDoFn< - EventTypeT, - EventKeyTypeT, - ResultTypeT, - StateTypeT extends MutableState> - extends DoFn>, KV> { - - private static final Logger LOG = LoggerFactory.getLogger(OrderedProcessorDoFn.class); - - private static final String PROCESSING_STATE = "processingState"; - private static final String MUTABLE_STATE = "mutableState"; - private static final String BUFFERED_EVENTS = "bufferedEvents"; - private static final String STATUS_EMISSION_TIMER = "statusTimer"; - private static final String LARGE_BATCH_EMISSION_TIMER = "largeBatchTimer"; - private static final String WINDOW_CLOSED = "windowClosed"; - private final EventExaminer eventExaminer; - - @StateId(BUFFERED_EVENTS) - @SuppressWarnings("unused") - private final StateSpec> bufferedEventsSpec; - - @StateId(PROCESSING_STATE) - @SuppressWarnings("unused") - private final StateSpec>> processingStateSpec; - - @SuppressWarnings("unused") - @StateId(MUTABLE_STATE) - private final StateSpec> mutableStateSpec; - - @StateId(WINDOW_CLOSED) - @SuppressWarnings("unused") - private final StateSpec> windowClosedSpec; - - @TimerId(STATUS_EMISSION_TIMER) - @SuppressWarnings("unused") - private final TimerSpec statusEmissionTimer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME); - - @TimerId(LARGE_BATCH_EMISSION_TIMER) - @SuppressWarnings("unused") - private final TimerSpec largeBatchEmissionTimer = TimerSpecs.timer(TimeDomain.EVENT_TIME); - - private final TupleTag> statusTupleTag; - private final Duration statusUpdateFrequency; - - private final TupleTag> mainOutputTupleTag; - private final TupleTag>>> - unprocessedEventsTupleTag; - private final boolean produceStatusUpdateOnEveryEvent; - - private final long maxNumberOfResultsToProduce; - - private Long numberOfResultsBeforeBundleStart; - - /** - * Stateful DoFn to do the bulk of processing. 
- * - * @param eventExaminer - * @param eventCoder - * @param stateCoder - * @param keyCoder - * @param mainOutputTupleTag - * @param statusTupleTag - * @param statusUpdateFrequency - * @param unprocessedEventTupleTag - * @param produceStatusUpdateOnEveryEvent - * @param maxNumberOfResultsToProduce - */ - OrderedProcessorDoFn( - EventExaminer eventExaminer, - Coder eventCoder, - Coder stateCoder, - Coder keyCoder, - TupleTag> mainOutputTupleTag, - TupleTag> statusTupleTag, - Duration statusUpdateFrequency, - TupleTag>>> - unprocessedEventTupleTag, - boolean produceStatusUpdateOnEveryEvent, - long maxNumberOfResultsToProduce) { - this.eventExaminer = eventExaminer; - this.bufferedEventsSpec = StateSpecs.orderedList(eventCoder); - this.mutableStateSpec = StateSpecs.value(stateCoder); - this.processingStateSpec = StateSpecs.value(ProcessingStateCoder.of(keyCoder)); - this.windowClosedSpec = StateSpecs.value(BooleanCoder.of()); - this.mainOutputTupleTag = mainOutputTupleTag; - this.statusTupleTag = statusTupleTag; - this.unprocessedEventsTupleTag = unprocessedEventTupleTag; - this.statusUpdateFrequency = statusUpdateFrequency; - this.produceStatusUpdateOnEveryEvent = produceStatusUpdateOnEveryEvent; - this.maxNumberOfResultsToProduce = maxNumberOfResultsToProduce; - } - - @StartBundle - public void onBundleStart() { - numberOfResultsBeforeBundleStart = null; - } - - @FinishBundle - public void onBundleFinish() { - // This might be necessary because this field is also used in a Timer - numberOfResultsBeforeBundleStart = null; - } + static class ToTimestampedEventConverter + extends DoFn< + KV>, TimestampedValue>>> { @ProcessElement - public void processElement( - @StateId(BUFFERED_EVENTS) OrderedListState bufferedEventsState, - @AlwaysFetched @StateId(PROCESSING_STATE) - ValueState> processingStateState, - @StateId(MUTABLE_STATE) ValueState mutableStateState, - @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, - @TimerId(LARGE_BATCH_EMISSION_TIMER) Timer largeBatchEmissionTimer, - @Element KV> eventAndSequence, - MultiOutputReceiver outputReceiver, - BoundedWindow window) { - - EventKeyTypeT key = eventAndSequence.getKey(); - long sequence = eventAndSequence.getValue().getKey(); - EventTypeT event = eventAndSequence.getValue().getValue(); - - ProcessingState processingState = processingStateState.read(); - - if (processingState == null) { - // This is the first time we see this key/window pair - processingState = new ProcessingState<>(key); - if (statusUpdateFrequency != null) { - // Set up the timer to produce the status of the processing on a regular basis - statusEmissionTimer.offset(statusUpdateFrequency).setRelative(); - } - } - - if (numberOfResultsBeforeBundleStart == null) { - // Per key processing is synchronized by Beam. There is no need to have it here. 
- numberOfResultsBeforeBundleStart = processingState.getResultCount(); - } - - processingState.eventReceived(); - - StateTypeT state = - processNewEvent( - sequence, - event, - processingState, - mutableStateState, - bufferedEventsState, - outputReceiver); - - processBufferedEvents( - processingState, state, bufferedEventsState, outputReceiver, largeBatchEmissionTimer); - - saveStates( - processingStateState, - processingState, - mutableStateState, - state, - outputReceiver, - window.maxTimestamp()); - - checkIfProcessingIsCompleted(processingState); - } - - private boolean checkIfProcessingIsCompleted(ProcessingState processingState) { - boolean result = processingState.isProcessingCompleted(); - if (result) { - LOG.info("Processing for key '" + processingState.getKey() + "' is completed."); - } - return result; - } - - private void saveStates( - ValueState> processingStatusState, - ProcessingState processingStatus, - ValueState currentStateState, - StateTypeT state, - MultiOutputReceiver outputReceiver, - Instant windowTimestamp) { - // There is always a change to the processing status - processingStatusState.write(processingStatus); - - // Stored state may not have changes if the element was out of sequence. - if (state != null) { - currentStateState.write(state); - } - - if (produceStatusUpdateOnEveryEvent) { - // During pipeline draining the window timestamp is set to a large value in the future. - // Producing an event before that results in error, that's why this logic exist. - Instant statusTimestamp = windowTimestamp; - - emitProcessingStatus(processingStatus, outputReceiver, statusTimestamp); - } - } - - private void emitProcessingStatus( - ProcessingState processingState, - MultiOutputReceiver outputReceiver, - Instant statusTimestamp) { - outputReceiver - .get(statusTupleTag) - .outputWithTimestamp( - KV.of( - processingState.getKey(), - OrderedProcessingStatus.create( - processingState.getLastOutputSequence(), - processingState.getBufferedEventCount(), - processingState.getEarliestBufferedSequence(), - processingState.getLatestBufferedSequence(), - processingState.getEventsReceived(), - processingState.getResultCount(), - processingState.getDuplicates(), - processingState.isLastEventReceived())), - statusTimestamp); - } - - /** - * Process the just received event. - * - * @return newly created or updated State. If null is returned - the event wasn't processed. - */ - private StateTypeT processNewEvent( - long currentSequence, - EventTypeT currentEvent, - ProcessingState processingState, - ValueState currentStateState, - OrderedListState bufferedEventsState, - MultiOutputReceiver outputReceiver) { - if (currentSequence == Long.MAX_VALUE) { - // OrderedListState can't handle the timestamp based on MAX_VALUE. - // To avoid exceptions, we DLQ this event. 
- outputReceiver - .get(unprocessedEventsTupleTag) - .output( - KV.of( - processingState.getKey(), - KV.of( - currentSequence, - UnprocessedEvent.create( - currentEvent, Reason.sequence_id_outside_valid_range)))); - return null; - } - - if (processingState.hasAlreadyBeenProcessed(currentSequence)) { - outputReceiver - .get(unprocessedEventsTupleTag) - .output( - KV.of( - processingState.getKey(), - KV.of( - currentSequence, UnprocessedEvent.create(currentEvent, Reason.duplicate)))); - return null; - } - - StateTypeT state; - boolean thisIsTheLastEvent = eventExaminer.isLastEvent(currentSequence, currentEvent); - if (eventExaminer.isInitialEvent(currentSequence, currentEvent)) { - // First event of the key/window - // What if it's a duplicate event - it will reset everything. Shall we drop/DLQ anything - // that's before the processingState.lastOutputSequence? - state = eventExaminer.createStateOnInitialEvent(currentEvent); - - processingState.eventAccepted(currentSequence, thisIsTheLastEvent); - - ResultTypeT result = state.produceResult(); - if (result != null) { - outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); - processingState.resultProduced(); - } - - // Nothing else to do. We will attempt to process buffered events later. - return state; - } - - if (processingState.isNextEvent(currentSequence)) { - // Event matches expected sequence - state = currentStateState.read(); - - try { - state.mutate(currentEvent); - } catch (Exception e) { - outputReceiver - .get(unprocessedEventsTupleTag) - .output( - KV.of( - processingState.getKey(), - KV.of(currentSequence, UnprocessedEvent.create(currentEvent, e)))); - return null; - } - - ResultTypeT result = state.produceResult(); - if (result != null) { - outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); - processingState.resultProduced(); - } - processingState.eventAccepted(currentSequence, thisIsTheLastEvent); - - return state; - } - - // Event is not ready to be processed yet - Instant eventTimestamp = Instant.ofEpochMilli(currentSequence); - bufferedEventsState.add(TimestampedValue.of(currentEvent, eventTimestamp)); - processingState.eventBuffered(currentSequence, thisIsTheLastEvent); - - // This will signal that the state hasn't been mutated and we don't need to save it. - return null; - } - - /** Process buffered events. 
*/ - private void processBufferedEvents( - ProcessingState processingState, - StateTypeT state, - OrderedListState bufferedEventsState, - MultiOutputReceiver outputReceiver, - Timer largeBatchEmissionTimer) { - if (state == null) { - // Only when the current event caused a state mutation and the state is passed to this - // method should we attempt to process buffered events - return; - } - - if (!processingState.readyToProcessBufferedEvents()) { - return; - } - - if (reachedMaxResultCountForBundle(processingState, largeBatchEmissionTimer)) { - // No point in trying to process buffered events - return; - } - - Instant startRange = Instant.ofEpochMilli(processingState.getEarliestBufferedSequence()); - Instant endRange = Instant.ofEpochMilli(processingState.getLatestBufferedSequence() + 1); - Instant endClearRange = null; - - // readRange is efficiently implemented and will bring records in batches - Iterable> events = - bufferedEventsState.readRange(startRange, endRange); - - Iterator> bufferedEventsIterator = events.iterator(); - while (bufferedEventsIterator.hasNext()) { - TimestampedValue timestampedEvent = bufferedEventsIterator.next(); - Instant eventTimestamp = timestampedEvent.getTimestamp(); - long eventSequence = eventTimestamp.getMillis(); - - EventTypeT bufferedEvent = timestampedEvent.getValue(); - if (processingState.checkForDuplicateBatchedEvent(eventSequence)) { - outputReceiver - .get(unprocessedEventsTupleTag) - .output( - KV.of( - processingState.getKey(), - KV.of( - eventSequence, - UnprocessedEvent.create(bufferedEvent, Reason.duplicate)))); - continue; - } - - if (eventSequence > processingState.getLastOutputSequence() + 1) { - processingState.foundSequenceGap(eventSequence); - // Records will be cleared up to this element - endClearRange = Instant.ofEpochMilli(eventSequence); - break; - } - - // This check needs to be done after we checked for sequence gap and before we - // attempt to process the next element which can result in a new result. - if (reachedMaxResultCountForBundle(processingState, largeBatchEmissionTimer)) { - endClearRange = Instant.ofEpochMilli(eventSequence); - break; - } - - try { - state.mutate(bufferedEvent); - } catch (Exception e) { - outputReceiver - .get(unprocessedEventsTupleTag) - .output( - KV.of( - processingState.getKey(), - KV.of(eventSequence, UnprocessedEvent.create(bufferedEvent, e)))); - // There is a chance that the next event will have the same sequence number and will - // process successfully. - continue; - } - - ResultTypeT result = state.produceResult(); - if (result != null) { - outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); - processingState.resultProduced(); - } - processingState.processedBufferedEvent(eventSequence); - // Remove this record also - endClearRange = Instant.ofEpochMilli(eventSequence + 1); - } - - bufferedEventsState.clearRange(startRange, endClearRange); - } - - private boolean reachedMaxResultCountForBundle( - ProcessingState processingState, Timer largeBatchEmissionTimer) { - boolean exceeded = - processingState.resultsProducedInBundle(numberOfResultsBeforeBundleStart) - >= maxNumberOfResultsToProduce; - if (exceeded) { - LOG.info( - "Setting the timer to output next batch of events for key '" - + processingState.getKey() - + "'"); - // See GroupIntoBatches for examples on how to hold the timestamp. - // TODO: test that on draining the pipeline all the results are still produced correctly. 
- // See: https://github.com/apache/beam/issues/30781 - largeBatchEmissionTimer.offset(Duration.millis(1)).setRelative(); - } - return exceeded; - } - - @OnTimer(LARGE_BATCH_EMISSION_TIMER) - public void onBatchEmission( - OnTimerContext context, - @StateId(BUFFERED_EVENTS) OrderedListState bufferedEventsState, - @AlwaysFetched @StateId(PROCESSING_STATE) - ValueState> processingStatusState, - @AlwaysFetched @StateId(MUTABLE_STATE) ValueState currentStateState, - @TimerId(LARGE_BATCH_EMISSION_TIMER) Timer largeBatchEmissionTimer, - MultiOutputReceiver outputReceiver) { - ProcessingState processingState = processingStatusState.read(); - if (processingState == null) { - LOG.warn("Processing state is empty. Ignore it if the pipeline is being cancelled."); - return; - } - StateTypeT state = currentStateState.read(); - if (state == null) { - LOG.warn("Mutable state is empty. Ignore it if the pipeline is being cancelled."); - return; - } - - LOG.debug("Starting to process batch for key '" + processingState.getKey() + "'"); - - this.numberOfResultsBeforeBundleStart = processingState.getResultCount(); - - processBufferedEvents( - processingState, state, bufferedEventsState, outputReceiver, largeBatchEmissionTimer); - - saveStates( - processingStatusState, - processingState, - currentStateState, - state, - outputReceiver, - // TODO: validate that this is correct. - context.window().maxTimestamp()); - - checkIfProcessingIsCompleted(processingState); - } - - @OnTimer(STATUS_EMISSION_TIMER) - @SuppressWarnings("unused") - public void onStatusEmission( - MultiOutputReceiver outputReceiver, - @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, - @StateId(WINDOW_CLOSED) ValueState windowClosedState, - @StateId(PROCESSING_STATE) - ValueState> processingStateState) { - - ProcessingState currentState = processingStateState.read(); - if (currentState == null) { - // This could happen if the state has been purged already during the draining. - // It means that there is nothing that we can do and we just need to return. 
- LOG.warn( - "Current processing state is null in onStatusEmission() - most likely the pipeline is shutting down."); - return; - } - - emitProcessingStatus(currentState, outputReceiver, Instant.now()); - - Boolean windowClosed = windowClosedState.read(); - if (!currentState.isProcessingCompleted() - // Stop producing statuses if we are finished for a particular key - && (windowClosed == null || !windowClosed)) { - statusEmissionTimer.offset(statusUpdateFrequency).setRelative(); - } - } - - @OnWindowExpiration - public void onWindowExpiration(@StateId(WINDOW_CLOSED) ValueState windowClosedState) { - windowClosedState.write(true); + public void convert( + @Element KV> element, + @Timestamp Instant timestamp, + OutputReceiver>>> outputReceiver) { + outputReceiver.output(TimestampedValue.of(element, timestamp)); } } } diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorResult.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorResult.java index f61df6254b253..48b9fafc99af7 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorResult.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorResult.java @@ -18,10 +18,12 @@ package org.apache.beam.sdk.extensions.ordered; import java.util.Map; +import javax.annotation.Nullable; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.PInput; import org.apache.beam.sdk.values.POutput; import org.apache.beam.sdk.values.PValue; @@ -29,10 +31,15 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; /** - * The result of the ordered processing. Two PCollections are returned: + * The result of the ordered processing. Three PCollections are returned: *
  • output - the key/value of the mutated states
 + *
  • unprocessedEvents - the key/value of the events that failed to be processed and the failure
 + * reason
  • processingStatuses - the key/value of the status of processing for a particular key
 *
 + *
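To make the shape of the result concrete, here is a minimal consumption sketch. Only output(), unprocessedEvents(), and the new latestContiguousRange() accessor are visible in this diff; MyEvent/MyResult, the upstream wiring, and the logging DoFn are hypothetical stand-ins.

    // Sketch only, not part of the patch: consuming an OrderedEventProcessorResult.
    import org.apache.beam.sdk.transforms.DoFn;
    import org.apache.beam.sdk.transforms.ParDo;
    import org.apache.beam.sdk.values.KV;
    import org.apache.beam.sdk.values.PCollection;
    import org.apache.beam.sdk.values.PCollectionView;

    void consume(OrderedEventProcessorResult<String, MyResult, MyEvent> result) {
      // Mutated states, one element per successfully applied event.
      PCollection<KV<String, MyResult>> outputs = result.output();

      // Events that could not be applied, keyed by sequence number with the failure reason.
      PCollection<KV<String, KV<Long, UnprocessedEvent<MyEvent>>>> failures =
          result.unprocessedEvents();

      // New in this change: null for per-key sequencing, populated for global sequences.
      PCollectionView<ContiguousSequenceRange> rangeView = result.latestContiguousRange();
      if (rangeView != null) {
        outputs.apply(
            "LogRange",
            ParDo.of(
                    new DoFn<KV<String, MyResult>, String>() {
                      @ProcessElement
                      public void process(ProcessContext c) {
                        // The view is an ordinary side input holding the latest range.
                        c.output(c.element().getKey() + " @ " + c.sideInput(rangeView));
                      }
                    })
                .withSideInputs(rangeView));
      }
    }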
    In case of global sequence processing, the result also contains PCollectionView of the + * latest contiguous sequence range + * * @param * @param */ @@ -48,6 +55,8 @@ public class OrderedEventProcessorResult implements POutp unprocessedEventPCollection; private final TupleTag>>> unprocessedEventTupleTag; + private final @Nullable PCollectionView latestContiguousRange; + OrderedEventProcessorResult( Pipeline pipeline, PCollection> outputPCollection, @@ -57,6 +66,27 @@ public class OrderedEventProcessorResult implements POutp PCollection>>> unprocessedEventPCollection, TupleTag>>> unprocessedEventTupleTag) { + this( + pipeline, + outputPCollection, + outputPCollectionTupleTag, + eventProcessingStatusPCollection, + eventProcessingStatusTupleTag, + unprocessedEventPCollection, + unprocessedEventTupleTag, + null); + } + + OrderedEventProcessorResult( + Pipeline pipeline, + PCollection> outputPCollection, + TupleTag> outputPCollectionTupleTag, + PCollection> eventProcessingStatusPCollection, + TupleTag> eventProcessingStatusTupleTag, + PCollection>>> unprocessedEventPCollection, + TupleTag>>> unprocessedEventTupleTag, + @Nullable PCollectionView latestContiguousRange) { + this.pipeline = pipeline; this.outputPCollection = outputPCollection; this.outputPCollectionTupleTag = outputPCollectionTupleTag; @@ -64,6 +94,7 @@ public class OrderedEventProcessorResult implements POutp this.eventProcessingStatusTupleTag = eventProcessingStatusTupleTag; this.unprocessedEventPCollection = unprocessedEventPCollection; this.unprocessedEventTupleTag = unprocessedEventTupleTag; + this.latestContiguousRange = latestContiguousRange; } private final Pipeline pipeline; @@ -104,4 +135,8 @@ public PCollection> output() { public PCollection>>> unprocessedEvents() { return unprocessedEventPCollection; } + + public @Nullable PCollectionView latestContiguousRange() { + return latestContiguousRange; + } } diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingHandler.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingHandler.java index 444fdb118091b..d8ad13330a1a9 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingHandler.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingHandler.java @@ -22,7 +22,11 @@ import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.extensions.ordered.combiner.DefaultSequenceCombiner; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.Combine.GloballyAsSingletonView; import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TimestampedValue; import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @@ -30,6 +34,11 @@ /** * Parent class for Ordered Processing configuration handlers. * + *
    There are two types of processing - when the sequence numbers are contiguous per key and these + * sequences per keys are independent of each other, and when there is a global sequence shared by + * all keys. In case of the global sequence processing the custom handler must extend from {@see + * OrderedProcessingGlobalSequenceHandler}. + * * @param type of events to be processed * @param type of keys which will be used to group the events * @param type of internal State which will be used for processing @@ -217,4 +226,75 @@ public int getMaxOutputElementsPerBundle() { public void setMaxOutputElementsPerBundle(int maxOutputElementsPerBundle) { this.maxOutputElementsPerBundle = maxOutputElementsPerBundle; } + + /** + * Parent class for Ordered Processing configuration handlers to handle processing of the events + * where global sequence is used. + * + * @param type of events to be processed + * @param type of keys which will be used to group the events + * @param type of internal State which will be used for processing + * @param type of the result of the processing which will be output + */ + public abstract static class OrderedProcessingGlobalSequenceHandler< + EventT, KeyT, StateT extends MutableState, ResultT> + extends OrderedProcessingHandler { + + public OrderedProcessingGlobalSequenceHandler( + Class eventTClass, + Class keyTClass, + Class stateTClass, + Class resultTClass) { + super(eventTClass, keyTClass, stateTClass, resultTClass); + } + + /** + * Provide the global sequence combiner. Default is to use {@link DefaultSequenceCombiner}. + * + * @return combiner + */ + public GloballyAsSingletonView< + TimestampedValue>>, ContiguousSequenceRange> + getGlobalSequenceCombiner() { + return Combine.globally(new DefaultSequenceCombiner(getEventExaminer())) + .asSingletonView(); + } + + /** + * How frequently the combiner should reevaluate the maximum range? This parameter only affects + * the behaviour of streaming pipelines. + * + *
    This parameter is used together with {@link + * OrderedProcessingGlobalSequenceHandler#getMaxElementCountToTriggerContinuousSequenceRangeReevaluation()}. + * The re-evaluation will occur as soon as the number of new elements exceeds the threshold or + * the time exceeds the frequency. + * + *
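Concretely, a handler subclass can pin both knobs. A sketch under the assumption that MyEvent/MyState/MyResult and MyEventExaminer exist; the overridden method names are the ones declared in this file:

    // Sketch only: refresh the contiguous range every 5 seconds, or as soon as
    // 10,000 new elements arrive, whichever happens first.
    import org.joda.time.Duration;

    public class MyGlobalSequenceHandler
        extends OrderedProcessingHandler.OrderedProcessingGlobalSequenceHandler<
            MyEvent, String, MyState, MyResult> {

      public MyGlobalSequenceHandler() {
        super(MyEvent.class, String.class, MyState.class, MyResult.class);
      }

      @Override
      public EventExaminer<MyEvent, MyState> getEventExaminer() {
        return new MyEventExaminer(); // hypothetical EventExaminer implementation
      }

      @Override
      public Duration getContiguousSequenceRangeReevaluationFrequency() {
        return Duration.standardSeconds(5);
      }

      @Override
      public int getMaxElementCountToTriggerContinuousSequenceRangeReevaluation() {
        return 10_000;
      }
    }

A shorter frequency keeps the side input fresher at the cost of more combiner firings; the element-count trigger covers bursts that arrive between timer firings.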
    Notice that some runners cache the output of side inputs and this parameter might not + * appear to have an effect unless the cache time-to-live is equal or less than this frequency. + * For Dataflow runner, see {@link this + * Dataflow streaming pipeline option} + * + * @return frequency of reevaluating the {@link ContiguousSequenceRange}. Default - every + * second. + * @see + * OrderedProcessingGlobalSequenceHandler#getMaxElementCountToTriggerContinuousSequenceRangeReevaluation() + */ + public Duration getContiguousSequenceRangeReevaluationFrequency() { + return Duration.standardSeconds(1); + } + + /** + * Number of new elements to trigger the re-evaluation. + * + *
    See {@link + * OrderedProcessingGlobalSequenceHandler#getContiguousSequenceRangeReevaluationFrequency()} for + * additional details. + * + * @return batch size. Default - 1000. + */ + public int getMaxElementCountToTriggerContinuousSequenceRangeReevaluation() { + return 1000; + } + } } diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingStatus.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingStatus.java index 6659bd2e2b922..7a556de1017b7 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingStatus.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/OrderedProcessingStatus.java @@ -30,16 +30,16 @@ public abstract class OrderedProcessingStatus { public static OrderedProcessingStatus create( - Long lastOutputSequence, + @Nullable Long lastProcessedSequence, long numberOfBufferedEvents, - Long earliestBufferedSequence, - Long latestBufferedSequence, + @Nullable Long earliestBufferedSequence, + @Nullable Long latestBufferedSequence, long numberOfReceivedEvents, long resultCount, long duplicateCount, boolean lastEventReceived) { return new AutoValue_OrderedProcessingStatus.Builder() - .setLastProcessedSequence(lastOutputSequence) + .setLastProcessedSequence(lastProcessedSequence) .setNumberOfBufferedEvents(numberOfBufferedEvents) .setEarliestBufferedSequence(earliestBufferedSequence) .setLatestBufferedSequence(latestBufferedSequence) @@ -55,8 +55,7 @@ public static OrderedProcessingStatus create( * @return Last sequence processed. If null is returned - no elements for the given key and window * have been processed yet. */ - @Nullable - public abstract Long getLastProcessedSequence(); + public abstract @Nullable Long getLastProcessedSequence(); /** @return Number of events received out of sequence and buffered. */ public abstract long getNumberOfBufferedEvents(); @@ -129,13 +128,13 @@ public final int hashCode() { @AutoValue.Builder public abstract static class Builder { - public abstract Builder setLastProcessedSequence(Long value); + public abstract Builder setLastProcessedSequence(@Nullable Long value); public abstract Builder setNumberOfBufferedEvents(long value); - public abstract Builder setEarliestBufferedSequence(Long value); + public abstract Builder setEarliestBufferedSequence(@Nullable Long value); - public abstract Builder setLatestBufferedSequence(Long value); + public abstract Builder setLatestBufferedSequence(@Nullable Long value); public abstract Builder setNumberOfReceivedEvents(long value); diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/PerKeyTickerGenerator.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/PerKeyTickerGenerator.java new file mode 100644 index 0000000000000..a18ba53f5266c --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/PerKeyTickerGenerator.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.ordered; + +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.VarLongCoder; +import org.apache.beam.sdk.state.StateSpec; +import org.apache.beam.sdk.state.StateSpecs; +import org.apache.beam.sdk.state.TimeDomain; +import org.apache.beam.sdk.state.Timer; +import org.apache.beam.sdk.state.TimerSpec; +import org.apache.beam.sdk.state.TimerSpecs; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * PTransform to generate per key tickers with certain frequency. + * + * @param + * @param + */ +class PerKeyTickerGenerator + extends PTransform< + PCollection>>, + PCollection>>> { + + private static final Logger LOG = LoggerFactory.getLogger(PerKeyTickerGenerator.class); + + private final Coder eventKeyCoder; + private final Coder eventCoder; + private final Duration tickerFrequency; + + PerKeyTickerGenerator( + Coder eventKeyCoder, Coder eventCoder, Duration tickerFrequency) { + this.eventKeyCoder = eventKeyCoder; + this.eventCoder = eventCoder; + this.tickerFrequency = tickerFrequency; + } + + @Override + public @UnknownKeyFor @NonNull @Initialized PCollection>> expand( + PCollection>> input) { + return input + .apply( + "Generate Tickers", + ParDo.of(new PerKeyTickerGeneratorDoFn<>(eventKeyCoder, tickerFrequency))) + .setCoder( + KvCoder.of(eventKeyCoder, KvCoder.of(VarLongCoder.of(), NullableCoder.of(eventCoder)))); + } + + static class PerKeyTickerGeneratorDoFn + extends DoFn>, KV>> { + + private static final String STATE = "state"; + private static final String TIMER = "timer"; + + @StateId(STATE) + @SuppressWarnings("unused") + private final StateSpec> stateSpec; + + @TimerId(TIMER) + @SuppressWarnings("unused") + private final TimerSpec tickerTimer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME); + + private final Duration tickerFrequency; + + PerKeyTickerGeneratorDoFn(Coder keyCoder, Duration tickerFrequency) { + stateSpec = StateSpecs.value(keyCoder); + this.tickerFrequency = tickerFrequency; + } + + @ProcessElement + public void process( + @Element KV> element, + @AlwaysFetched @StateId(STATE) ValueState state, + @TimerId(TIMER) Timer tickerTimer) { + @Nullable EventKeyT keyValue = state.read(); + if (keyValue != null) { + return; + } + + tickerTimer.offset(tickerFrequency).setRelative(); + + state.write(element.getKey()); + } + + @OnTimer(TIMER) + public void onTimer( + @StateId(STATE) ValueState state, + @TimerId(TIMER) Timer tickerTimer, + 
OutputReceiver>> outputReceiver) { + + @Nullable EventKeyT key = state.read(); + if (key == null) { + LOG.error("Expected to get the key from the state, but got null"); + return; + } + + // Null value will be an indicator to the main transform that the element is a ticker + outputReceiver.output(KV.of(key, KV.of(0L, null))); + tickerTimer.offset(tickerFrequency).setRelative(); + } + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessingState.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessingState.java index 4b591a37faab8..425eb4444a634 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessingState.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessingState.java @@ -51,6 +51,8 @@ class ProcessingState { private long resultCount; + @Nullable private ContiguousSequenceRange lastCompleteGlobalSequence; + private KeyT key; public ProcessingState(KeyT key) { @@ -59,6 +61,7 @@ public ProcessingState(KeyT key) { this.lastOutputSequence = null; this.earliestBufferedSequence = null; this.latestBufferedSequence = null; + this.lastCompleteGlobalSequence = null; } /** @@ -130,6 +133,15 @@ public KeyT getKey() { return key; } + public @Nullable ContiguousSequenceRange getLastContiguousRange() { + return lastCompleteGlobalSequence; + } + + public void setLastCompleteGlobalSequence( + @Nullable ContiguousSequenceRange lastCompleteGlobalSequence) { + this.lastCompleteGlobalSequence = lastCompleteGlobalSequence; + } + /** * Current event matched the sequence and was processed. * @@ -229,6 +241,32 @@ public int hashCode() { key); } + @Override + public String toString() { + return "ProcessingState{" + + "lastOutputSequence=" + + lastOutputSequence + + ", latestBufferedSequence=" + + latestBufferedSequence + + ", earliestBufferedSequence=" + + earliestBufferedSequence + + ", bufferedEventCount=" + + bufferedEventCount + + ", lastEventReceived=" + + lastEventReceived + + ", eventsReceived=" + + eventsReceived + + ", duplicates=" + + duplicates + + ", resultCount=" + + resultCount + + ", lastCompleteGlobalSequence=" + + lastCompleteGlobalSequence + + ", key=" + + key + + '}'; + } + public boolean isProcessingCompleted() { return lastEventReceived && bufferedEventCount == 0; } @@ -274,6 +312,23 @@ public long resultsProducedInBundle(long numberOfResultsBeforeBundleStart) { return resultCount - numberOfResultsBeforeBundleStart; } + public void updateGlobalSequenceDetails(ContiguousSequenceRange updated) { + if (thereAreGloballySequencedEventsToBeProcessed()) { + // We don't update the timer if we can already process events in the onTimer batch. + // Otherwise, it's possible that we will be pushing the timer to later timestamps + // without a chance to run and produce output. + return; + } + this.lastCompleteGlobalSequence = updated; + } + + public boolean thereAreGloballySequencedEventsToBeProcessed() { + return bufferedEventCount > 0 + && lastCompleteGlobalSequence != null + && earliestBufferedSequence != null + && earliestBufferedSequence < lastCompleteGlobalSequence.getEnd(); + } + /** * Coder for the processing status. 
* @@ -287,6 +342,9 @@ static class ProcessingStateCoder extends Coder> { private static final VarIntCoder INTEGER_CODER = VarIntCoder.of(); private static final BooleanCoder BOOLEAN_CODER = BooleanCoder.of(); + private static final NullableCoder SEQUENCE_AND_TIMESTAMP_CODER = + NullableCoder.of(ContiguousSequenceRange.CompletedSequenceRangeCoder.of()); + private Coder keyCoder; private ProcessingStateCoder(Coder keyCoder) { @@ -308,6 +366,7 @@ public void encode(ProcessingState value, OutputStream outStream) throws I LONG_CODER.encode(value.getResultCount(), outStream); BOOLEAN_CODER.encode(value.isLastEventReceived(), outStream); keyCoder.encode(value.getKey(), outStream); + SEQUENCE_AND_TIMESTAMP_CODER.encode(value.getLastContiguousRange(), outStream); } @Override @@ -321,17 +380,23 @@ public ProcessingState decode(InputStream inStream) throws IOException { long resultCount = LONG_CODER.decode(inStream); boolean isLastEventReceived = BOOLEAN_CODER.decode(inStream); KeyT key = keyCoder.decode(inStream); - - return new ProcessingState<>( - key, - lastOutputSequence, - earliestBufferedSequence, - latestBufferedSequence, - bufferedRecordCount, - recordsReceivedCount, - duplicates, - resultCount, - isLastEventReceived); + ContiguousSequenceRange lastCompleteGlobalSequence = + SEQUENCE_AND_TIMESTAMP_CODER.decode(inStream); + + ProcessingState result = + new ProcessingState<>( + key, + lastOutputSequence, + earliestBufferedSequence, + latestBufferedSequence, + bufferedRecordCount, + recordsReceivedCount, + duplicates, + resultCount, + isLastEventReceived); + result.setLastCompleteGlobalSequence(lastCompleteGlobalSequence); + + return result; } @Override diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessorDoFn.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessorDoFn.java new file mode 100644 index 0000000000000..a05b0829074af --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/ProcessorDoFn.java @@ -0,0 +1,427 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered; + +import java.util.Iterator; +import javax.annotation.Nullable; +import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.Reason; +import org.apache.beam.sdk.state.OrderedListState; +import org.apache.beam.sdk.state.Timer; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TimestampedValue; +import org.apache.beam.sdk.values.TupleTag; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base DoFn for processing ordered events. + * + * @param type of the events to process + * @param event key type + * @param state type + */ +abstract class ProcessorDoFn< + EventT, EventKeyT, ResultT, StateT extends MutableState> + extends DoFn>, KV> { + + private static final Logger LOG = LoggerFactory.getLogger(ProcessorDoFn.class); + + protected static final String PROCESSING_STATE = "processingState"; + protected static final String MUTABLE_STATE = "mutableState"; + + protected static final String STATUS_EMISSION_TIMER = "statusTimer"; + protected static final String WINDOW_CLOSED = "windowClosed"; + protected final EventExaminer eventExaminer; + + private final TupleTag> statusTupleTag; + protected final Duration statusUpdateFrequency; + + protected final TupleTag> mainOutputTupleTag; + protected final TupleTag>>> + unprocessedEventsTupleTag; + private final boolean produceStatusUpdateOnEveryEvent; + + private final long maxNumberOfResultsToProduce; + + protected @Nullable Long numberOfResultsBeforeBundleStart = 0L; + + ProcessorDoFn( + EventExaminer eventExaminer, + TupleTag> mainOutputTupleTag, + TupleTag> statusTupleTag, + Duration statusUpdateFrequency, + TupleTag>>> unprocessedEventTupleTag, + boolean produceStatusUpdateOnEveryEvent, + long maxNumberOfResultsToProduce) { + this.eventExaminer = eventExaminer; + + this.mainOutputTupleTag = mainOutputTupleTag; + this.statusTupleTag = statusTupleTag; + this.unprocessedEventsTupleTag = unprocessedEventTupleTag; + this.statusUpdateFrequency = statusUpdateFrequency; + this.produceStatusUpdateOnEveryEvent = produceStatusUpdateOnEveryEvent; + this.maxNumberOfResultsToProduce = maxNumberOfResultsToProduce; + } + + @StartBundle + public void onBundleStart() { + numberOfResultsBeforeBundleStart = null; + } + + @FinishBundle + public void onBundleFinish() { + // This might be necessary because this field is also used in a Timer + numberOfResultsBeforeBundleStart = null; + } + + /** @return true if each event needs to be examined. */ + abstract boolean checkForFirstOrLastEvent(); + + /** + * Process the just received event. + * + * @return newly created or updated State. If null is returned - the event wasn't processed. + */ + protected @javax.annotation.Nullable StateT processNewEvent( + long currentSequence, + EventT currentEvent, + ProcessingState processingState, + ValueState currentStateState, + OrderedListState bufferedEventsState, + MultiOutputReceiver outputReceiver) { + if (currentSequence == Long.MAX_VALUE) { + // OrderedListState can't handle the timestamp based on MAX_VALUE. + // To avoid exceptions, we DLQ this event. 
+ outputReceiver + .get(unprocessedEventsTupleTag) + .output( + KV.of( + processingState.getKey(), + KV.of( + currentSequence, + UnprocessedEvent.create( + currentEvent, Reason.sequence_id_outside_valid_range)))); + return null; + } + + if (processingState.hasAlreadyBeenProcessed(currentSequence)) { + outputReceiver + .get(unprocessedEventsTupleTag) + .output( + KV.of( + processingState.getKey(), + KV.of(currentSequence, UnprocessedEvent.create(currentEvent, Reason.duplicate)))); + return null; + } + + StateT state; + boolean thisIsTheLastEvent = + checkForFirstOrLastEvent() && eventExaminer.isLastEvent(currentSequence, currentEvent); + if (checkForFirstOrLastEvent() && eventExaminer.isInitialEvent(currentSequence, currentEvent)) { + // First event of the key/window + // What if it's a duplicate event - it will reset everything. Shall we drop/DLQ anything + // that's before the processingState.lastOutputSequence? + state = eventExaminer.createStateOnInitialEvent(currentEvent); + + processingState.eventAccepted(currentSequence, thisIsTheLastEvent); + + ResultT result = state.produceResult(); + if (result != null) { + outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); + processingState.resultProduced(); + } + + // Nothing else to do. We will attempt to process buffered events later. + return state; + } + + if (processingState.isNextEvent(currentSequence)) { + // Event matches expected sequence + state = currentStateState.read(); + if (state == null) { + LOG.warn("Unexpectedly got an empty state. Most likely cause is pipeline drainage."); + return null; + } + + try { + state.mutate(currentEvent); + } catch (Exception e) { + outputReceiver + .get(unprocessedEventsTupleTag) + .output( + KV.of( + processingState.getKey(), + KV.of(currentSequence, UnprocessedEvent.create(currentEvent, e)))); + return null; + } + + ResultT result = state.produceResult(); + if (result != null) { + outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); + processingState.resultProduced(); + } + processingState.eventAccepted(currentSequence, thisIsTheLastEvent); + + return state; + } + + // Event is not ready to be processed yet + bufferEvent( + currentSequence, currentEvent, processingState, bufferedEventsState, thisIsTheLastEvent); + + // This will signal that the state hasn't been mutated. We don't need to save it. + return null; + } + + protected void saveStates( + ValueState> processingStatusState, + ProcessingState processingStatus, + ValueState currentStateState, + @Nullable StateT state, + MultiOutputReceiver outputReceiver, + Instant windowTimestamp) { + // There is always a change to the processing status + processingStatusState.write(processingStatus); + + // Stored state may not have changes if the element was out of sequence. + if (state != null) { + currentStateState.write(state); + } + + if (produceStatusUpdateOnEveryEvent) { + // During pipeline draining the window timestamp is set to a large value in the future. + // Producing an event before that results in error, that's why this logic exist. 
+ Instant statusTimestamp = windowTimestamp; + + emitProcessingStatus(processingStatus, outputReceiver, statusTimestamp); + } + } + + void processStatusTimerEvent( + MultiOutputReceiver outputReceiver, + Timer statusEmissionTimer, + ValueState windowClosedState, + ValueState> processingStateState) { + ProcessingState currentState = processingStateState.read(); + if (currentState == null) { + // This could happen if the state has been purged already during the draining. + // It means that there is nothing that we can do. + LOG.warn( + "Current processing state is null in onStatusEmission() - most likely the pipeline is shutting down."); + return; + } + + emitProcessingStatus(currentState, outputReceiver, Instant.now()); + + Boolean windowClosed = windowClosedState.read(); + if (!currentState.isProcessingCompleted() + // Stop producing statuses if we are finished for a particular key + && (windowClosed == null || !windowClosed)) { + statusEmissionTimer.offset(statusUpdateFrequency).setRelative(); + } + } + + protected void emitProcessingStatus( + ProcessingState processingState, + MultiOutputReceiver outputReceiver, + Instant statusTimestamp) { + if (LOG.isTraceEnabled()) { + LOG.trace("Emitting status for: " + processingState.getKey() + ", " + processingState); + } + outputReceiver + .get(statusTupleTag) + .outputWithTimestamp( + KV.of( + processingState.getKey(), + OrderedProcessingStatus.create( + processingState.getLastOutputSequence(), + processingState.getBufferedEventCount(), + processingState.getEarliestBufferedSequence(), + processingState.getLatestBufferedSequence(), + processingState.getEventsReceived(), + processingState.getResultCount(), + processingState.getDuplicates(), + processingState.isLastEventReceived())), + statusTimestamp); + } + + protected boolean reachedMaxResultCountForBundle( + ProcessingState processingState, Timer largeBatchEmissionTimer) { + boolean exceeded = + processingState.resultsProducedInBundle( + numberOfResultsBeforeBundleStart == null ? 0 : numberOfResultsBeforeBundleStart) + >= maxNumberOfResultsToProduce; + if (exceeded) { + if (LOG.isTraceEnabled()) { + LOG.trace( + "Setting the timer to output next batch of events for key '" + + processingState.getKey() + + "'"); + } + // See GroupIntoBatches for examples on how to hold the timestamp. + // TODO: test that on draining the pipeline all the results are still produced correctly. 
+ // See: https://github.com/apache/beam/issues/30781 + largeBatchEmissionTimer.offset(Duration.millis(1)).setRelative(); + } + return exceeded; + } + + private void bufferEvent( + long currentSequence, + EventT currentEvent, + ProcessingState processingState, + OrderedListState bufferedEventsState, + boolean thisIsTheLastEvent) { + Instant eventTimestamp = fromLong(currentSequence); + bufferedEventsState.add(TimestampedValue.of(currentEvent, eventTimestamp)); + processingState.eventBuffered(currentSequence, thisIsTheLastEvent); + } + + abstract boolean checkForSequenceGapInBufferedEvents(); + + @Nullable + StateT processBufferedEventRange( + ProcessingState processingState, + @Nullable StateT state, + OrderedListState bufferedEventsState, + MultiOutputReceiver outputReceiver, + Timer largeBatchEmissionTimer, + ContiguousSequenceRange contiguousSequenceRange) { + Long earliestBufferedSequence = processingState.getEarliestBufferedSequence(); + Long latestBufferedSequence = processingState.getLatestBufferedSequence(); + if (earliestBufferedSequence == null || latestBufferedSequence == null) { + return state; + } + Instant startRange = fromLong(earliestBufferedSequence); + Instant endRange = fromLong(latestBufferedSequence + 1); + + // readRange is efficiently implemented and will bring records in batches + Iterable> events = bufferedEventsState.readRange(startRange, endRange); + + Instant endClearRange = startRange; // it will get re-adjusted later. + + Iterator> bufferedEventsIterator = events.iterator(); + while (bufferedEventsIterator.hasNext()) { + TimestampedValue timestampedEvent = bufferedEventsIterator.next(); + Instant eventTimestamp = timestampedEvent.getTimestamp(); + long eventSequence = eventTimestamp.getMillis(); + + EventT bufferedEvent = timestampedEvent.getValue(); + boolean skipProcessing = false; + boolean beforeInitialSequence = false; + + if (contiguousSequenceRange != null && eventSequence < contiguousSequenceRange.getStart()) { + // In case of global sequence processing - remove the elements below the range start + skipProcessing = true; + beforeInitialSequence = true; + endClearRange = fromLong(eventSequence); + } + if (processingState.checkForDuplicateBatchedEvent(eventSequence)) { + // There could be multiple events under the same sequence number. Only the first one + // will get processed. The rest are considered duplicates. + skipProcessing = true; + } + + if (skipProcessing) { + outputReceiver + .get(unprocessedEventsTupleTag) + .output( + KV.of( + processingState.getKey(), + KV.of( + eventSequence, + UnprocessedEvent.create( + bufferedEvent, + beforeInitialSequence + ? Reason.before_initial_sequence + : Reason.duplicate)))); + // TODO: When there is a large number of duplicates this can cause a situation where + // we produce too much output and the runner will start throwing unrecoverable errors. + // Need to add counting logic to accumulate both the normal and DLQ outputs. + continue; + } + + Long lastOutputSequence = processingState.getLastOutputSequence(); + boolean currentEventIsNextInSequence = + lastOutputSequence != null && eventSequence == lastOutputSequence + 1; + boolean continueProcessing = + checkForSequenceGapInBufferedEvents() + ? 
currentEventIsNextInSequence + : (eventSequence < contiguousSequenceRange.getEnd() || currentEventIsNextInSequence); + if (!continueProcessing) { + processingState.foundSequenceGap(eventSequence); + // Records will be cleared up to this element + endClearRange = fromLong(eventSequence); + break; + } + + // This check needs to be done after we checked for sequence gap and before we + // attempt to process the next element which can result in a new result. + if (reachedMaxResultCountForBundle(processingState, largeBatchEmissionTimer)) { + endClearRange = fromLong(eventSequence); + break; + } + + // Remove this record also + endClearRange = fromLong(eventSequence + 1); + + try { + if (state == null) { + if (LOG.isTraceEnabled()) { + LOG.trace("Creating a new state: " + processingState.getKey() + " " + bufferedEvent); + } + state = eventExaminer.createStateOnInitialEvent(bufferedEvent); + } else { + if (LOG.isTraceEnabled()) { + LOG.trace("Mutating " + processingState.getKey() + " " + bufferedEvent); + } + state.mutate(bufferedEvent); + } + } catch (Exception e) { + outputReceiver + .get(unprocessedEventsTupleTag) + .output( + KV.of( + processingState.getKey(), + KV.of(eventSequence, UnprocessedEvent.create(bufferedEvent, e)))); + // There is a chance that the next event will have the same sequence number and will + // process successfully. + continue; + } + + ResultT result = state.produceResult(); + if (result != null) { + outputReceiver.get(mainOutputTupleTag).output(KV.of(processingState.getKey(), result)); + processingState.resultProduced(); + } + processingState.processedBufferedEvent(eventSequence); + } + + bufferedEventsState.clearRange(startRange, endClearRange); + + return state; + } + + static Instant fromLong(long value) { + return Instant.ofEpochMilli(value); + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/SequencePerKeyProcessorDoFn.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/SequencePerKeyProcessorDoFn.java new file mode 100644 index 0000000000000..878a0664ac875 --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/SequencePerKeyProcessorDoFn.java @@ -0,0 +1,294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered; + +import javax.annotation.Nullable; +import org.apache.beam.sdk.coders.BooleanCoder; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.extensions.ordered.ProcessingState.ProcessingStateCoder; +import org.apache.beam.sdk.state.OrderedListState; +import org.apache.beam.sdk.state.StateSpec; +import org.apache.beam.sdk.state.StateSpecs; +import org.apache.beam.sdk.state.TimeDomain; +import org.apache.beam.sdk.state.Timer; +import org.apache.beam.sdk.state.TimerSpec; +import org.apache.beam.sdk.state.TimerSpecs; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TupleTag; +import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Stateful DoFn to process per key sequences. + * + * @param event type + * @param event key type + * @param result type + * @param state type + */ +class SequencePerKeyProcessorDoFn< + EventTypeT, + EventKeyTypeT, + ResultTypeT, + StateTypeT extends MutableState> + extends ProcessorDoFn { + + private static final Logger LOG = LoggerFactory.getLogger(SequencePerKeyProcessorDoFn.class); + + private static final String LARGE_BATCH_EMISSION_TIMER = "largeBatchTimer"; + protected static final String BUFFERED_EVENTS = "bufferedEvents"; + + @TimerId(LARGE_BATCH_EMISSION_TIMER) + @SuppressWarnings("unused") + private final TimerSpec largeBatchEmissionTimer = TimerSpecs.timer(TimeDomain.EVENT_TIME); + + @StateId(BUFFERED_EVENTS) + @SuppressWarnings("unused") + private final StateSpec> bufferedEventsSpec; + + @SuppressWarnings("unused") + @StateId(MUTABLE_STATE) + private final StateSpec> mutableStateSpec; + + @StateId(WINDOW_CLOSED) + @SuppressWarnings("unused") + private final StateSpec> windowClosedSpec; + + @TimerId(STATUS_EMISSION_TIMER) + @SuppressWarnings("unused") + private final TimerSpec statusEmissionTimer = TimerSpecs.timer(TimeDomain.PROCESSING_TIME); + + @StateId(PROCESSING_STATE) + @SuppressWarnings("unused") + private final StateSpec>> processingStateSpec; + + /** + * Stateful DoFn to do the bulk of processing. 
+ * + * @param eventExaminer + * @param eventCoder + * @param stateCoder + * @param keyCoder + * @param mainOutputTupleTag + * @param statusTupleTag + * @param statusUpdateFrequency + * @param unprocessedEventTupleTag + * @param produceStatusUpdateOnEveryEvent + * @param maxNumberOfResultsToProduce + */ + SequencePerKeyProcessorDoFn( + EventExaminer eventExaminer, + Coder eventCoder, + Coder stateCoder, + Coder keyCoder, + TupleTag> mainOutputTupleTag, + TupleTag> statusTupleTag, + Duration statusUpdateFrequency, + TupleTag>>> unprocessedEventTupleTag, + boolean produceStatusUpdateOnEveryEvent, + long maxNumberOfResultsToProduce) { + super( + eventExaminer, + mainOutputTupleTag, + statusTupleTag, + statusUpdateFrequency, + unprocessedEventTupleTag, + produceStatusUpdateOnEveryEvent, + maxNumberOfResultsToProduce); + this.bufferedEventsSpec = StateSpecs.orderedList(eventCoder); + this.processingStateSpec = StateSpecs.value(ProcessingStateCoder.of(keyCoder)); + this.mutableStateSpec = StateSpecs.value(stateCoder); + this.windowClosedSpec = StateSpecs.value(BooleanCoder.of()); + } + + @Override + boolean checkForFirstOrLastEvent() { + return true; + } + + @Override + boolean checkForSequenceGapInBufferedEvents() { + return true; + } + + @ProcessElement + public void processElement( + @StateId(BUFFERED_EVENTS) OrderedListState bufferedEventsState, + @AlwaysFetched @StateId(PROCESSING_STATE) + ValueState> processingStateState, + @StateId(MUTABLE_STATE) ValueState mutableStateState, + @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, + @TimerId(LARGE_BATCH_EMISSION_TIMER) Timer largeBatchEmissionTimer, + @Element KV> eventAndSequence, + MultiOutputReceiver outputReceiver, + BoundedWindow window, + ProcessContext context) { + EventKeyTypeT key = eventAndSequence.getKey(); + long sequence = eventAndSequence.getValue().getKey(); + EventTypeT event = eventAndSequence.getValue().getValue(); + + ProcessingState processingState = processingStateState.read(); + + if (processingState == null) { + // This is the first time we see this key/window pair + processingState = new ProcessingState<>(key); + if (statusUpdateFrequency != null) { + // Set up the timer to produce the status of the processing on a regular basis + statusEmissionTimer.offset(statusUpdateFrequency).setRelative(); + } + } + + if (numberOfResultsBeforeBundleStart == null) { + // Per key processing is synchronized by Beam. There is no need to have it here. + numberOfResultsBeforeBundleStart = processingState.getResultCount(); + } + + processingState.eventReceived(); + + StateTypeT state = + processNewEvent( + sequence, + event, + processingState, + mutableStateState, + bufferedEventsState, + outputReceiver); + + processBufferedEvents( + processingState, state, bufferedEventsState, outputReceiver, largeBatchEmissionTimer); + + saveStates( + processingStateState, + processingState, + mutableStateState, + state, + outputReceiver, + window.maxTimestamp()); + + checkIfProcessingIsCompleted(processingState); + } + + private boolean checkIfProcessingIsCompleted(ProcessingState processingState) { + boolean result = processingState.isProcessingCompleted(); + if (result && LOG.isTraceEnabled()) { + LOG.trace("Processing for key '" + processingState.getKey() + "' is completed."); + } + return result; + } + + /** Process buffered events. 
*/ + private void processBufferedEvents( + ProcessingState processingState, + @Nullable StateTypeT state, + OrderedListState bufferedEventsState, + MultiOutputReceiver outputReceiver, + Timer largeBatchEmissionTimer) { + if (state == null) { + // Only when the current event caused a state mutation and the state is passed to this + // method should we attempt to process buffered events + return; + } + + if (!processingState.readyToProcessBufferedEvents()) { + return; + } + + if (reachedMaxResultCountForBundle(processingState, largeBatchEmissionTimer)) { + // No point in trying to process buffered events + return; + } + + // Technically this block is not needed because these preconditions are checked + // earlier. Included to keep the linter happy. + Long earliestBufferedSequence = processingState.getEarliestBufferedSequence(); + if (earliestBufferedSequence == null) { + return; + } + Long latestBufferedSequence = processingState.getLatestBufferedSequence(); + if (latestBufferedSequence == null) { + return; + } + + processBufferedEventRange( + processingState, + state, + bufferedEventsState, + outputReceiver, + largeBatchEmissionTimer, + ContiguousSequenceRange.EMPTY); + } + + @OnTimer(LARGE_BATCH_EMISSION_TIMER) + public void onBatchEmission( + OnTimerContext context, + @StateId(BUFFERED_EVENTS) OrderedListState bufferedEventsState, + @AlwaysFetched @StateId(PROCESSING_STATE) + ValueState> processingStatusState, + @AlwaysFetched @StateId(MUTABLE_STATE) ValueState currentStateState, + @TimerId(LARGE_BATCH_EMISSION_TIMER) Timer largeBatchEmissionTimer, + MultiOutputReceiver outputReceiver) { + ProcessingState processingState = processingStatusState.read(); + if (processingState == null) { + LOG.warn("Processing state is empty. Ignore it if the pipeline is being cancelled."); + return; + } + StateTypeT state = currentStateState.read(); + if (state == null) { + LOG.warn("Mutable state is empty. Ignore it if the pipeline is being cancelled."); + return; + } + + LOG.debug("Starting to process batch for key '" + processingState.getKey() + "'"); + + this.numberOfResultsBeforeBundleStart = processingState.getResultCount(); + + processBufferedEvents( + processingState, state, bufferedEventsState, outputReceiver, largeBatchEmissionTimer); + + saveStates( + processingStatusState, + processingState, + currentStateState, + state, + outputReceiver, + // TODO: validate that this is correct. 
+ context.window().maxTimestamp()); + + checkIfProcessingIsCompleted(processingState); + } + + @OnTimer(STATUS_EMISSION_TIMER) + @SuppressWarnings("unused") + public void onStatusEmission( + MultiOutputReceiver outputReceiver, + @TimerId(STATUS_EMISSION_TIMER) Timer statusEmissionTimer, + @StateId(WINDOW_CLOSED) ValueState windowClosedState, + @StateId(PROCESSING_STATE) ValueState> processingStateState) { + + processStatusTimerEvent( + outputReceiver, statusEmissionTimer, windowClosedState, processingStateState); + } + + @OnWindowExpiration + public void onWindowExpiration(@StateId(WINDOW_CLOSED) ValueState windowClosedState) { + windowClosedState.write(true); + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/UnprocessedEvent.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/UnprocessedEvent.java index 2131ef384e22f..d7c599277567c 100644 --- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/UnprocessedEvent.java +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/UnprocessedEvent.java @@ -72,7 +72,8 @@ public enum Reason { duplicate, buffered, sequence_id_outside_valid_range, - exception_thrown + exception_thrown, + before_initial_sequence }; public abstract EventT getEvent(); diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/DefaultSequenceCombiner.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/DefaultSequenceCombiner.java new file mode 100644 index 0000000000000..32e5cbc36e4e6 --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/DefaultSequenceCombiner.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered.combiner; + +import java.util.Iterator; +import java.util.function.BiFunction; +import org.apache.beam.sdk.coders.CannotProvideCoderException; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CoderRegistry; +import org.apache.beam.sdk.extensions.ordered.ContiguousSequenceRange; +import org.apache.beam.sdk.extensions.ordered.EventExaminer; +import org.apache.beam.sdk.extensions.ordered.MutableState; +import org.apache.beam.sdk.extensions.ordered.combiner.SequenceRangeAccumulator.SequenceRangeAccumulatorCoder; +import org.apache.beam.sdk.transforms.Combine.CombineFn; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TimestampedValue; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Default global sequence combiner. + * + *
    Produces the largest {@link ContiguousSequenceRange} of contiguous longs which starts from the + * initial event identified by {@link EventExaminer#isInitialEvent(long, EventT)}. + * + *
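For reference, a minimal examiner that anchors the global sequence at zero might look like the sketch below. MyEvent/MyState and the terminal flag are hypothetical; the three methods are the ones this patch calls on EventExaminer:

    // Sketch only: sequence number 0 starts the global sequence.
    class ZeroBasedExaminer implements EventExaminer<MyEvent, MyState> {

      @Override
      public boolean isInitialEvent(long sequenceNumber, MyEvent event) {
        return sequenceNumber == 0;
      }

      @Override
      public MyState createStateOnInitialEvent(MyEvent event) {
        return new MyState(event);
      }

      @Override
      public boolean isLastEvent(long sequenceNumber, MyEvent event) {
        // Not consulted by this combiner (see the note below), but part of the contract.
        return event.isFinal(); // hypothetical terminal marker
      }
    }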
    This combiner currently doesn't use {@link EventExaminer#isLastEvent(long, EventT)}. + * + * @param type of key + * @param type of event + * @param type of state + */ +public class DefaultSequenceCombiner> + extends CombineFn< + TimestampedValue>>, + SequenceRangeAccumulator, + ContiguousSequenceRange> { + + private static final Logger LOG = LoggerFactory.getLogger(DefaultSequenceCombiner.class); + + public static final BiFunction<@NonNull Instant, @Nullable Instant, @Nullable Instant> + OLDEST_TIMESTAMP_SELECTOR = + (instant1, instant2) -> { + if (instant2 == null) { + return instant1; + } + @NonNull Instant nonNullableSecondValue = instant2; + return instant1.isAfter(nonNullableSecondValue) ? instant1 : nonNullableSecondValue; + }; + private final EventExaminer eventExaminer; + + public DefaultSequenceCombiner(EventExaminer eventExaminer) { + this.eventExaminer = eventExaminer; + } + + @Override + public SequenceRangeAccumulator createAccumulator() { + return new SequenceRangeAccumulator(); + } + + @Override + public SequenceRangeAccumulator addInput( + SequenceRangeAccumulator accum, TimestampedValue>> event) { + long sequence = event.getValue().getValue().getKey(); + + accum.add( + sequence, + event.getTimestamp(), + eventExaminer.isInitialEvent(sequence, event.getValue().getValue().getValue())); + + return accum; + } + + @Override + public SequenceRangeAccumulator mergeAccumulators( + Iterable accumulators) { + // There should be at least one accumulator. + Iterator iterator = accumulators.iterator(); + SequenceRangeAccumulator result = iterator.next(); + while (iterator.hasNext()) { + result.merge(iterator.next()); + } + return result; + } + + @Override + public ContiguousSequenceRange extractOutput(SequenceRangeAccumulator accum) { + ContiguousSequenceRange result = accum.largestContinuousRange(); + if (LOG.isTraceEnabled()) { + LOG.trace("Returning completed sequence range: " + result); + } + return result; + } + + @Override + public @UnknownKeyFor @NonNull @Initialized Coder getAccumulatorCoder( + @UnknownKeyFor @NonNull @Initialized CoderRegistry registry, + @UnknownKeyFor @NonNull @Initialized + Coder>>> inputCoder) + throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException { + return SequenceRangeAccumulatorCoder.of(); + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulator.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulator.java new file mode 100644 index 0000000000000..89dc912afc90c --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulator.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.ordered.combiner;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Map.Entry;
+import java.util.Objects;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import javax.annotation.Nullable;
+import org.apache.beam.sdk.coders.CoderException;
+import org.apache.beam.sdk.coders.CustomCoder;
+import org.apache.beam.sdk.coders.NullableCoder;
+import org.apache.beam.sdk.coders.VarIntCoder;
+import org.apache.beam.sdk.coders.VarLongCoder;
+import org.apache.beam.sdk.extensions.ordered.ContiguousSequenceRange;
+import org.apache.commons.lang3.tuple.Pair;
+import org.checkerframework.checker.initialization.qual.Initialized;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.checkerframework.checker.nullness.qual.UnknownKeyFor;
+import org.joda.time.Instant;
+
+/** Default accumulator used to combine sequence ranges. */
+public class SequenceRangeAccumulator {
+
+  private static Instant max(Instant a, Instant b) {
+    return a.isAfter(b) ? a : b;
+  }
+
+  /**
+   * The tree contains a set of non-overlapping contiguous ranges, where the key is the lower
+   * inclusive start of the range, the left value of the pair is the inclusive end of the range,
+   * and the right value is the maximum timestamp in the range.
+   *
+   * <p>For example (an illustrative sketch): after add(1), add(2), add(3) and add(5) the tree
+   * contains two entries, {1 -> (3, maxTs of 1..3)} and {5 -> (5, ts of 5)}; a subsequent add(4)
+   * merges them into the single entry {1 -> (5, maxTs of 1..5)}.
+   *
+   * <p>
    The maximum timestamp is critical for the correctness of the ordered processing. During the + * merge process the merged range is assigned the maximum timestamp of the two ranges that created + * this new range. + */ + private final TreeMap> data = new TreeMap<>(); + + private @Nullable Long initialSequence = null; + + public void add(long sequence, Instant timestamp, boolean isInitialSequence) { + if (isInitialSequence && this.initialSequence != null && sequence != this.initialSequence) { + throw new IllegalStateException( + "There are different initial sequences detected: " + + initialSequence + + " and " + + sequence); + } + + if (sequence == Long.MAX_VALUE) { + // This is an invalid value and DoFns will not process this element. This will also allow + // to produce a ContiguousSequenceRange with the exclusive end value. + return; + } + + if (isInitialSequence) { + this.initialSequence = sequence; + clearRangesBelowInitialSequence(sequence, timestamp); + } else if (initialSequence != null && sequence <= initialSequence) { + // No need to add anything lower than the initial sequence to the accumulator. + return; + } + + long lowerBound = sequence, upperBound = sequence; + + Entry> lowerRange = data.floorEntry(sequence); + if (lowerRange != null) { + long inclusiveUpperBoundary = lowerRange.getValue().getLeft(); + if (sequence <= inclusiveUpperBoundary) { + // Duplicate. No need to adjust the timestamp. + return; + } + + if (inclusiveUpperBoundary + 1 == sequence) { + // The new element extends the lower range. Remove the range. + timestamp = max(timestamp, lowerRange.getValue().getValue()); + lowerBound = lowerRange.getKey(); + data.remove(lowerRange.getKey()); + } + } + + long nextSequenceNumber = sequence + 1; + Pair upperRange = data.get(nextSequenceNumber); + if (upperRange != null) { + // The new element will extend the upper range. Remove the range. + timestamp = max(timestamp, upperRange.getRight()); + upperBound = upperRange.getLeft(); + data.remove(nextSequenceNumber); + } + + data.put(lowerBound, Pair.of(upperBound, timestamp)); + } + + private void clearRangesBelowInitialSequence(long sequence, Instant timestamp) { + // First, adjust the current range, if any + Entry> lowerRange = data.floorEntry(sequence); + if (lowerRange != null + && lowerRange.getKey() < sequence + && lowerRange.getValue().getLeft() > sequence) { + // The sequence is in the middle of the range. Adjust it. + data.remove(lowerRange.getKey()); + data.put( + sequence, + Pair.of( + lowerRange.getValue().getKey(), max(timestamp, lowerRange.getValue().getValue()))); + } + data.subMap(Long.MIN_VALUE, sequence).clear(); + } + + public ContiguousSequenceRange largestContinuousRange() { + if (initialSequence == null) { + return ContiguousSequenceRange.EMPTY; + } + + Entry> firstEntry = data.firstEntry(); + if (firstEntry == null) { + throw new IllegalStateException("First entry is null when initial sequence is set."); + } + Long start = firstEntry.getKey(); + Long end = firstEntry.getValue().getLeft(); + Instant latestTimestamp = firstEntry.getValue().getRight(); + // Upper bound is inclusive, but the ContiguousSequenceRange's end is exclusive. + // The numeric overflow is prevented by dropping the value of Long.MAX. 
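+    // Illustrative note: with a single tree entry of {1 -> (3, ts)} this method returns
+    // ContiguousSequenceRange.of(1, 4, ts). Long.MAX_VALUE never enters the tree because add()
+    // drops such values, so "end + 1" below cannot overflow.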
+ return ContiguousSequenceRange.of(start, end + 1, latestTimestamp); + } + + public int numberOfRanges() { + return data.size(); + } + + public void merge(SequenceRangeAccumulator another) { + if (this.initialSequence != null + && another.initialSequence != null + && !this.initialSequence.equals(another.initialSequence)) { + throw new IllegalStateException( + "Two accumulators contain different initial sequences: " + + this.initialSequence + + " and " + + another.initialSequence); + } + + if (another.initialSequence != null) { + long newInitialSequence = another.initialSequence; + this.initialSequence = newInitialSequence; + Entry> firstEntry = another.data.firstEntry(); + if (firstEntry != null) { + Instant timestampOfTheInitialRange = firstEntry.getValue().getRight(); + clearRangesBelowInitialSequence(newInitialSequence, timestampOfTheInitialRange); + } + } + + another + .data + .entrySet() + .forEach( + entry -> { + long lowerBound = entry.getKey(); + long upperBound = entry.getValue().getLeft(); + if (this.initialSequence != null) { + if (upperBound < initialSequence) { + // The whole range is below the initial sequence. Ignore it. + return; + } + if (lowerBound < initialSequence) { + // This will cause pruning of the range up to the initial sequence + lowerBound = this.initialSequence; + } + } + + Entry> lowerRange = this.data.floorEntry(lowerBound); + + if (lowerRange != null) { + if (lowerRange.getValue().getLeft() < lowerBound - 1) { + // Nothing to do. There is a lower non-adjacent range. + } else { + // We found an overlapping range and will replace it with a new one + upperBound = Math.max(upperBound, lowerRange.getValue().getLeft()); + lowerBound = lowerRange.getKey(); + } + } + + Entry> upperRange = this.data.floorEntry(upperBound + 1); + if (upperRange == null + || (lowerRange != null + && Objects.equals(upperRange.getKey(), lowerRange.getKey()))) { + // Nothing to do - either there is no adjacent upper range or it equals the lower + // range + } else { + upperBound = Math.max(upperBound, upperRange.getValue().getLeft()); + } + + Instant latestTimestamp = + removeAllRanges(lowerBound, upperBound, entry.getValue().getRight()); + + this.data.put(lowerBound, Pair.of(upperBound, latestTimestamp)); + }); + } + + private Instant removeAllRanges(long lowerBound, long upperBound, Instant currentTimestamp) { + Instant result = currentTimestamp; + SortedMap> rangesToRemove = data.subMap(lowerBound, upperBound); + for (Pair value : rangesToRemove.values()) { + result = result.isAfter(value.getRight()) ? 
result : value.getRight(); + } + rangesToRemove.clear(); + return result; + } + + @Override + public boolean equals(@Nullable Object o) { + if (this == o) { + return true; + } + if (!(o instanceof SequenceRangeAccumulator)) { + return false; + } + SequenceRangeAccumulator that = (SequenceRangeAccumulator) o; + return data.equals(that.data) && Objects.equals(initialSequence, that.initialSequence); + } + + @Override + public int hashCode() { + return Objects.hash(data, initialSequence); + } + + @Override + public String toString() { + return "SequenceRangeAccumulator{initialSequence=" + initialSequence + ", data=" + data + '}'; + } + + public static class SequenceRangeAccumulatorCoder extends CustomCoder { + + private static final SequenceRangeAccumulatorCoder INSTANCE = + new SequenceRangeAccumulatorCoder(); + + public static SequenceRangeAccumulatorCoder of() { + return INSTANCE; + } + + private SequenceRangeAccumulatorCoder() {} + + private final NullableCoder initialSequenceCoder = NullableCoder.of(VarLongCoder.of()); + private final VarIntCoder numberOfRangesCoder = VarIntCoder.of(); + private final VarLongCoder dataCoder = VarLongCoder.of(); + + @Override + public void encode( + SequenceRangeAccumulator value, @UnknownKeyFor @NonNull @Initialized OutputStream outStream) + throws @UnknownKeyFor @NonNull @Initialized CoderException, @UnknownKeyFor @NonNull + @Initialized IOException { + numberOfRangesCoder.encode(value.numberOfRanges(), outStream); + initialSequenceCoder.encode(value.initialSequence, outStream); + for (Entry> entry : value.data.entrySet()) { + dataCoder.encode(entry.getKey(), outStream); + dataCoder.encode(entry.getValue().getLeft(), outStream); + dataCoder.encode(entry.getValue().getRight().getMillis(), outStream); + } + } + + @Override + public SequenceRangeAccumulator decode( + @UnknownKeyFor @NonNull @Initialized InputStream inStream) + throws @UnknownKeyFor @NonNull @Initialized CoderException, @UnknownKeyFor @NonNull + @Initialized IOException { + SequenceRangeAccumulator result = new SequenceRangeAccumulator(); + int numberOfRanges = numberOfRangesCoder.decode(inStream); + result.initialSequence = initialSequenceCoder.decode(inStream); + for (int i = 0; i < numberOfRanges; i++) { + long key = dataCoder.decode(inStream); + long upperBound = dataCoder.decode(inStream); + long millis = dataCoder.decode(inStream); + result.data.put(key, Pair.of(upperBound, Instant.ofEpochMilli(millis))); + } + return result; + } + } +} diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/package-info.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/package-info.java new file mode 100644 index 0000000000000..0d730d55fb9f8 --- /dev/null +++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/combiner/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Default implementation of the global sequence combiner used by {@link
+ * org.apache.beam.sdk.extensions.ordered.OrderedEventProcessor} when processing events using global
+ * sequences.
+ */
+package org.apache.beam.sdk.extensions.ordered.combiner;
diff --git a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/package-info.java b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/package-info.java
index f9d7e3d67bff1..4cbbca82a8cfd 100644
--- a/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/package-info.java
+++ b/sdks/java/extensions/ordered/src/main/java/org/apache/beam/sdk/extensions/ordered/package-info.java
@@ -16,7 +16,9 @@
  * limitations under the License.
  */
 /**
- * Provides a transform for ordered processing.
+ * Provides a transform for ordered processing. For a detailed reference implementation that uses
+ * this transform, see <a href="https://github.com/GoogleCloudPlatform/dataflow-ordered-processing">dataflow-ordered-processing</a>.
  *
  * @see org.apache.beam.sdk.extensions.ordered.OrderedEventProcessor
  */
diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorGlobalSequenceTest.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorGlobalSequenceTest.java
new file mode 100644
index 0000000000000..98bc7591f4d7a
--- /dev/null
+++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorGlobalSequenceTest.java
@@ -0,0 +1,534 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.beam.sdk.extensions.ordered; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.CannotProvideCoderException; +import org.apache.beam.sdk.extensions.ordered.StringBufferOrderedProcessingHandler.StringBufferOrderedProcessingWithGlobalSequenceHandler; +import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.Reason; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TimestampedValue; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Test; + +public class OrderedEventProcessorGlobalSequenceTest extends OrderedEventProcessorTestBase { + + public static final boolean GLOBAL_SEQUENCE = true; + + static { + Logger logger = Logger.getLogger(GlobalSequencesProcessorDoFn.class.getName()); + logger.setLevel(Level.FINEST); + } + + @org.junit.Test + public void testPerfectOrderingProcessing() throws CannotProvideCoderException { + Event[] events = { + Event.create(0, "id-1", "a"), + Event.create(1, "id-1", "b"), + Event.create(2, "id-1", "c"), + Event.create(3, "id-1", "d"), + Event.create(4, "id-2", "a"), + Event.create(5, "id-2", "b") + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + expectedOutput.add(KV.of("id-1", "abcd")); + expectedOutput.add(KV.of("id-2", "a")); + expectedOutput.add(KV.of("id-2", "ab")); + + testGlobalSequenceProcessing( + events, + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 6, new Instant())); + } + + @Test + public void testOutOfSequenceProcessing() throws CannotProvideCoderException { + Event[] events = { + Event.create(2, "id-1", "c"), + Event.create(1, "id-1", "b"), + Event.create(0, "id-1", "a"), + Event.create(3, "id-1", "d"), + Event.create(5, "id-2", "b"), + Event.create(6, "id-2", "c"), + Event.create(8, "id-2", "e"), + Event.create(4, "id-2", "a"), + Event.create(7, "id-2", "d") + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + expectedOutput.add(KV.of("id-1", "abcd")); + expectedOutput.add(KV.of("id-2", "a")); + expectedOutput.add(KV.of("id-2", "ab")); + expectedOutput.add(KV.of("id-2", "abc")); + expectedOutput.add(KV.of("id-2", "abcd")); + expectedOutput.add(KV.of("id-2", "abcde")); + + testGlobalSequenceProcessing( + events, + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 9, new Instant())); + } + + @Test + public void testHandlingOfDuplicateSequences() throws CannotProvideCoderException { + Event[] events = { + Event.create(3, "id-1", "d"), + Event.create(2, "id-1", "c"), + + // Duplicates + Event.create(3, "id-1", "d"), + Event.create(3, "id-1", "d"), + 
Event.create(0, "id-1", "a"), + Event.create(1, "id-1", "b"), + + // Additional duplicates + Event.create(1, "id-1", "b"), + Event.create(3, "id-1", "d"), + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + expectedOutput.add(KV.of("id-1", "abcd")); + + Collection>>> duplicates = new ArrayList<>(); + duplicates.add(KV.of("id-1", KV.of(3L, UnprocessedEvent.create("d", Reason.duplicate)))); + duplicates.add(KV.of("id-1", KV.of(3L, UnprocessedEvent.create("d", Reason.duplicate)))); + duplicates.add(KV.of("id-1", KV.of(1L, UnprocessedEvent.create("b", Reason.duplicate)))); + duplicates.add(KV.of("id-1", KV.of(3L, UnprocessedEvent.create("d", Reason.duplicate)))); + + testGlobalSequenceProcessing( + events, + expectedOutput, + duplicates, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 4, new Instant())); + } + + @Test + public void testTreatingSequencesBelowInitialAsDuplicates() throws CannotProvideCoderException { + Event[] events = { + Event.create(3, "id-1", "d"), + Event.create(2, "id-1", "c"), + + // Earlier events + Event.create(-1, "id-1", "early-1"), + Event.create(-2, "id-1", "early-2"), + Event.create(0, "id-1", "a"), + Event.create(1, "id-1", "b") + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + expectedOutput.add(KV.of("id-1", "abcd")); + + Collection>>> duplicates = new ArrayList<>(); + duplicates.add( + KV.of( + "id-1", + KV.of(-1L, UnprocessedEvent.create("early-1", Reason.before_initial_sequence)))); + duplicates.add( + KV.of( + "id-1", + KV.of(-2L, UnprocessedEvent.create("early-2", Reason.before_initial_sequence)))); + + testGlobalSequenceProcessing( + events, + expectedOutput, + duplicates, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 4, new Instant())); + } + + @Test + public void testHandlingOfCheckedExceptions() throws CannotProvideCoderException { + Event[] events = { + Event.create(0, "id-1", "a"), + Event.create(1, "id-1", "b"), + Event.create(2, "id-1", StringBuilderState.BAD_VALUE), + Event.create(3, "id-1", "c"), + }; + + // This is an interesting case - even though event #2 is not processed it doesn't affect + // the global sequence calculations. It is not considered a gap, and all the subsequent + // events will be processed. + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + + Collection>>> failedEvents = new ArrayList<>(); + failedEvents.add( + KV.of( + "id-1", + KV.of( + 2L, + UnprocessedEvent.create(StringBuilderState.BAD_VALUE, Reason.exception_thrown)))); + + testGlobalSequenceProcessing( + events, + expectedOutput, + failedEvents, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + // Sequence matcher doesn't know if the element is valid or not. 
+ // That's why the elements that are get rejected in the processor still count when + // calculating the global sequence + ContiguousSequenceRange.of(0, 4, new Instant())); + } + + @Test + public void testProcessingWithEveryOtherResultEmission() throws CannotProvideCoderException { + Event[] events = { + Event.create(2, "id-1", "c"), + Event.create(1, "id-1", "b"), + Event.create(0, "id-1", "a"), + Event.create(3, "id-1", "d"), + Event.create(4, "id-2", "a"), + Event.create(5, "id-2", "b"), + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + // Skipped KV.of("id-1", "ab"), + expectedOutput.add(KV.of("id-1", "abc")); + // Skipped KV.of("id-1", "abcd"), + expectedOutput.add(KV.of("id-2", "a")); + // Skipped KV.of("id-2", "ab") + testGlobalSequenceProcessing( + events, + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_OTHER_EVENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 6, new Instant())); + } + + @Test + public void testLargeBufferedOutputInTimer() throws CannotProvideCoderException { + int maxResultsPerOutput = 100; + + // Array of sequences starting with 2 and the last element - 1. + // Output will be buffered until the last event arrives + long[] sequences = new long[maxResultsPerOutput * 3]; + for (int i = 0; i < sequences.length - 1; i++) { + sequences[i] = i + 2L; + } + sequences[sequences.length - 1] = 1; + + List events = new ArrayList<>(sequences.length); + Collection> expectedOutput = new ArrayList<>(sequences.length); + + StringBuilder output = new StringBuilder(); + String outputPerElement = "."; + String key = "id-1"; + + for (long sequence : sequences) { + events.add(Event.create(sequence, key, outputPerElement)); + output.append(outputPerElement); + expectedOutput.add(KV.of(key, output.toString())); + } + + testGlobalSequenceProcessing( + events.toArray(new Event[events.size()]), + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + 1L /* This dataset assumes 1 as the starting sequence */, + maxResultsPerOutput, + ContiguousSequenceRange.of(1, sequences.length + 1, new Instant())); + } + + @Test + public void testSequenceGapProcessingInBufferedOutput() throws CannotProvideCoderException { + int maxResultsPerOutput = 3; + + long[] sequences = new long[] {2, 3, 7, 8, 9, 10, 1, 4, 5, 6}; + + List events = new ArrayList<>(sequences.length); + List> expectedOutput = new ArrayList<>(sequences.length); + + String key = "id-1"; + + for (long sequence : sequences) { + events.add(Event.create(sequence, key, sequence + "-")); + } + + StringBuilder output = new StringBuilder(); + Arrays.stream(sequences) + .sorted() + .forEach( + sequence -> { + output.append(sequence + "-"); + expectedOutput.add(KV.of(key, output.toString())); + }); + + testGlobalSequenceProcessing( + events.toArray(new Event[events.size()]), + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + 1L /* This dataset assumes 1 as the starting sequence */, + maxResultsPerOutput, + ContiguousSequenceRange.of(1, 11, new Instant())); + } + + @Test + public void testHandlingOfMaxSequenceNumber() throws CannotProvideCoderException { + Event[] events = { + Event.create(1, "id-1", "b"), + Event.create(0, "id-1", "a"), + Event.create(Long.MAX_VALUE, "id-1", "d"), + Event.create(2, "id-1", "c") + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "abc")); + + Collection>>> unprocessedEvents = + new 
ArrayList<>(); + unprocessedEvents.add( + KV.of( + "id-1", + KV.of( + Long.MAX_VALUE, + UnprocessedEvent.create("d", Reason.sequence_id_outside_valid_range)))); + + testGlobalSequenceProcessing( + events, + expectedOutput, + unprocessedEvents, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 3, Instant.now())); + } + + @Test + public void testProcessingOfTheLastInput() throws CannotProvideCoderException { + // TODO: fix the test. Need to see that the resulting status reflects the last input + Event[] events = { + Event.create(0, "id-1", "a"), + Event.create(1, "id-1", "b"), + Event.create(2, "id-1", StringEventExaminer.LAST_INPUT) + }; + + Collection> expectedOutput = new ArrayList<>(); + expectedOutput.add(KV.of("id-1", "a")); + expectedOutput.add(KV.of("id-1", "ab")); + expectedOutput.add(KV.of("id-1", "ab" + StringEventExaminer.LAST_INPUT)); + + testGlobalSequenceProcessing( + events, + expectedOutput, + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, + INITIAL_SEQUENCE_OF_0, + LARGE_MAX_RESULTS_PER_OUTPUT, + ContiguousSequenceRange.of(0, 3, new Instant())); + } + + private void testGlobalSequenceProcessing( + Event[] events, + Collection> expectedOutput, + int emissionFrequency, + long initialSequence, + int maxResultsPerOutput, + ContiguousSequenceRange expectedLastCompleteRange) + throws CannotProvideCoderException { + testGlobalSequenceProcessing( + events, + expectedOutput, + NO_EXPECTED_DLQ_EVENTS, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + expectedLastCompleteRange); + } + + private void testGlobalSequenceProcessing( + Event[] events, + Collection> expectedOutput, + Collection>>> expectedUnprocessedEvents, + int emissionFrequency, + long initialSequence, + int maxResultsPerOutput, + ContiguousSequenceRange expectedLastCompleteRange) + throws CannotProvideCoderException { + // Test a streaming pipeline + doTest( + events, + null /* expectedStatuses */, + expectedOutput, + expectedUnprocessedEvents, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + false /* produceStatusOnEveryEvent */, + STREAMING, + GLOBAL_SEQUENCE, + expectedLastCompleteRange); + + // Test a batch pipeline + if (runTestsOnDataflowRunner()) { + doTest( + events, + null /* expectedStatuses */, + expectedOutput, + expectedUnprocessedEvents, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + false /* produceStatusOnEveryEvent */, + BATCH, + GLOBAL_SEQUENCE, + expectedLastCompleteRange); + } else { + System.err.println( + "Warning - batch tests didn't run. " + + "DirectRunner doesn't work correctly with this transform in batch mode." + + "Run the tests using Dataflow runner to validate."); + } + } + + @Test + public void testWindowedProcessing() throws CannotProvideCoderException { + + Instant base = new Instant(0); + TestStream values = + TestStream.create(streamingPipeline.getCoderRegistry().getCoder(Event.class)) + .advanceWatermarkTo(base) + .addElements( + // Start of first window + TimestampedValue.of( + Event.create(0, "id-1", "a"), base.plus(Duration.standardSeconds(1))), + TimestampedValue.of( + Event.create(1, "id-1", "b"), base.plus(Duration.standardSeconds(2))), + TimestampedValue.of( + Event.create(0, "id-2", "x"), base.plus(Duration.standardSeconds(1))), + TimestampedValue.of( + Event.create(1, "id-2", "y"), base.plus(Duration.standardSeconds(2))), + TimestampedValue.of( + Event.create(2, "id-2", "z"), base.plus(Duration.standardSeconds(2))), + + // Start of second window. 
Numbering must start with 0 again. + TimestampedValue.of( + Event.create(0, "id-1", "c"), base.plus(Duration.standardSeconds(10))), + TimestampedValue.of( + Event.create(1, "id-1", "d"), base.plus(Duration.standardSeconds(11)))) + .advanceProcessingTime(Duration.standardMinutes(15)) + .advanceWatermarkToInfinity(); + + Pipeline pipeline = streamingPipeline; + + PCollection rawInput = pipeline.apply("Create Streaming Events", values); + PCollection>> input = + rawInput.apply("To KV", ParDo.of(new MapEventsToKV())); + + input = input.apply("Window input", Window.into(FixedWindows.of(Duration.standardSeconds(5)))); + + StringBufferOrderedProcessingWithGlobalSequenceHandler handler = + new StringBufferOrderedProcessingWithGlobalSequenceHandler( + EMISSION_FREQUENCY_ON_EVERY_ELEMENT, INITIAL_SEQUENCE_OF_0); + handler.setMaxOutputElementsPerBundle(LARGE_MAX_RESULTS_PER_OUTPUT); + handler.setStatusUpdateFrequency(null); + handler.setProduceStatusUpdateOnEveryEvent(false); + + OrderedEventProcessor orderedEventProcessor = + OrderedEventProcessor.create(handler); + + OrderedEventProcessorResult processingResult = + input.apply("Process Events", orderedEventProcessor); + + IntervalWindow window1 = new IntervalWindow(base, base.plus(Duration.standardSeconds(5))); + PAssert.that("Output matches in window 1", processingResult.output()) + .inWindow(window1) + .containsInAnyOrder( + KV.of("id-1", "a"), + KV.of("id-1", "ab"), + KV.of("id-2", "x"), + KV.of("id-2", "xy"), + KV.of("id-2", "xyz")); + + IntervalWindow window2 = + new IntervalWindow( + base.plus(Duration.standardSeconds(10)), base.plus(Duration.standardSeconds(15))); + PAssert.that("Output matches in window 2", processingResult.output()) + .inWindow(window2) + .containsInAnyOrder(KV.of("id-1", "c"), KV.of("id-1", "cd")); + + // TODO: can we make the status assertions work? 
+ // PAssert.that("Statuses match in window 1", processingResult.processingStatuses()) + // .inWindow(window1) + // .containsInAnyOrder( + //// KV.of("id-1", OrderedProcessingStatus.create(0L, 0, null, null, 1, 1, 0, + // false)), + // KV.of("id-1", OrderedProcessingStatus.create(1L, 0, null, null, 2, 2, 0, false)), + //// KV.of("id-2", OrderedProcessingStatus.create(0L, 0, null, null, 1, 1, 0, + // false)), + //// KV.of("id-2", OrderedProcessingStatus.create(1L, 0, null, null, 2, 2, 0, + // false)), + // KV.of("id-2", OrderedProcessingStatus.create(2L, 0, null, null, 3, 3, 0, false)) + // ); + + // PAssert.that("Statuses match in window 2", processingResult.processingStatuses()) + // .inWindow(window2) + // .containsInAnyOrder( + // KV.of("id-1", OrderedProcessingStatus.create(0L, 0, null, null, 1, 1, 0, false)), + // KV.of("id-1", OrderedProcessingStatus.create(1L, 0, null, null, 2, 2, 0, false))); + + PAssert.that("Unprocessed events match", processingResult.unprocessedEvents()) + .containsInAnyOrder(NO_EXPECTED_DLQ_EVENTS); + + pipeline.run(); + } +} diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTest.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorPerKeySequenceTest.java similarity index 71% rename from sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTest.java rename to sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorPerKeySequenceTest.java index 6a24021ad667d..6909a3bb992c1 100644 --- a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTest.java +++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorPerKeySequenceTest.java @@ -20,82 +20,24 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.List; -import java.util.Set; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.CannotProvideCoderException; import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.Reason; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.SerializableMatcher; -import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.TestStream; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.Reshuffle; -import org.apache.beam.sdk.transforms.windowing.AfterWatermark; import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.GlobalWindows; import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.Repeatedly; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.TimestampedValue; -import org.checkerframework.checker.initialization.qual.Initialized; -import org.checkerframework.checker.nullness.qual.NonNull; -import org.checkerframework.checker.nullness.qual.UnknownKeyFor; -import org.hamcrest.BaseMatcher; -import org.hamcrest.Description; import org.joda.time.Duration; import org.joda.time.Instant; -import org.junit.Rule; import 
org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -/** - * Ordered Processing tests use the same testing scenario. Events are sent in or out of sequence. - * Each event is a string for a particular key. The output is a concatenation of all strings. - */ -@RunWith(JUnit4.class) -public class OrderedEventProcessorTest { - - public static final boolean LAST_EVENT_RECEIVED = true; - public static final int EMISSION_FREQUENCY_ON_EVERY_ELEMENT = 1; - public static final int INITIAL_SEQUENCE_OF_0 = 0; - public static final boolean DONT_PRODUCE_STATUS_ON_EVERY_EVENT = false; - public static final int LARGE_MAX_RESULTS_PER_OUTPUT = 1000; - public static final int EMISSION_FREQUENCY_ON_EVERY_OTHER_EVENT = 2; - public static final boolean PRODUCE_STATUS_ON_EVERY_EVENT = true; - public static final boolean STREAMING = true; - public static final boolean BATCH = false; - public static final Set>>> NO_EXPECTED_DLQ_EVENTS = - Collections.emptySet(); - @Rule public final transient TestPipeline streamingPipeline = TestPipeline.create(); - @Rule public final transient TestPipeline batchPipeline = TestPipeline.create(); - - static class MapEventsToKV extends DoFn>> { - - @ProcessElement - public void convert( - @Element Event event, OutputReceiver>> outputReceiver) { - outputReceiver.output(KV.of(event.getKey(), KV.of(event.getSequence(), event.getValue()))); - } - } - - static class MapStringBufferStateToString - extends DoFn, KV> { - - @ProcessElement - public void map( - @Element KV element, - OutputReceiver> outputReceiver) { - outputReceiver.output(KV.of(element.getKey(), element.getValue().toString())); - } - } +public class OrderedEventProcessorPerKeySequenceTest extends OrderedEventProcessorTestBase { @Test public void testPerfectOrderingProcessing() throws CannotProvideCoderException { @@ -142,7 +84,7 @@ public void testPerfectOrderingProcessing() throws CannotProvideCoderException { expectedOutput.add(KV.of("id-2", "a")); expectedOutput.add(KV.of("id-2", "ab")); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -203,7 +145,7 @@ public void testOutOfSequenceProcessing() throws CannotProvideCoderException { expectedOutput.add(KV.of("id-2", "abcd")); expectedOutput.add(KV.of("id-2", "abcde")); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -235,7 +177,7 @@ public void testUnfinishedProcessing() throws CannotProvideCoderException { expectedOutput.add(KV.of("id-2", "a")); expectedOutput.add(KV.of("id-2", "ab")); - testProcessing(events, expectedStatuses, expectedOutput, 1, 0, 1000, false); + testPerKeySequenceProcessing(events, expectedStatuses, expectedOutput, 1, 0, 1000, false); } @Test @@ -275,7 +217,7 @@ public void testHandlingOfDuplicateSequences() throws CannotProvideCoderExceptio duplicates.add(KV.of("id-1", KV.of(1L, UnprocessedEvent.create("b", Reason.duplicate)))); duplicates.add(KV.of("id-1", KV.of(3L, UnprocessedEvent.create("d", Reason.duplicate)))); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -311,7 +253,7 @@ public void testHandlingOfCheckedExceptions() throws CannotProvideCoderException 2L, UnprocessedEvent.create(StringBuilderState.BAD_VALUE, Reason.exception_thrown)))); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -346,7 +288,7 @@ public void testProcessingWithEveryOtherResultEmission() throws CannotProvideCod // Skipped KV.of("id-1", "abcd"), 
expectedOutput.add(KV.of("id-2", "a")); // Skipped KV.of("id-2", "ab") - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -428,7 +370,7 @@ public void testLargeBufferedOutputInTimer() throws CannotProvideCoderException 0, false))); - testProcessing( + testPerKeySequenceProcessing( events.toArray(new Event[events.size()]), expectedStatuses, expectedOutput, @@ -523,7 +465,7 @@ public void testSequenceGapProcessingInBufferedOutput() throws CannotProvideCode OrderedProcessingStatus.create( 10L, 0, null, null, numberOfReceivedEvents, 10L, 0, false))); - testProcessing( + testPerKeySequenceProcessing( events.toArray(new Event[events.size()]), expectedStatuses, expectedOutput, @@ -558,7 +500,7 @@ public void testHandlingOfMaxSequenceNumber() throws CannotProvideCoderException Long.MAX_VALUE, UnprocessedEvent.create("c", Reason.sequence_id_outside_valid_range)))); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -589,7 +531,7 @@ public void testProcessingOfTheLastInput() throws CannotProvideCoderException { expectedOutput.add(KV.of("id-1", "ab")); expectedOutput.add(KV.of("id-1", "ab" + StringEventExaminer.LAST_INPUT)); - testProcessing( + testPerKeySequenceProcessing( events, expectedStatuses, expectedOutput, @@ -599,6 +541,65 @@ public void testProcessingOfTheLastInput() throws CannotProvideCoderException { DONT_PRODUCE_STATUS_ON_EVERY_EVENT); } + protected void testPerKeySequenceProcessing( + Event[] events, + Collection> expectedStatuses, + Collection> expectedOutput, + int emissionFrequency, + long initialSequence, + int maxResultsPerOutput, + boolean produceStatusOnEveryEvent) + throws CannotProvideCoderException { + testPerKeySequenceProcessing( + events, + expectedStatuses, + expectedOutput, + NO_EXPECTED_DLQ_EVENTS, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + produceStatusOnEveryEvent); + } + + protected void testPerKeySequenceProcessing( + Event[] events, + Collection> expectedStatuses, + Collection> expectedOutput, + Collection>>> expectedUnprocessedEvents, + int emissionFrequency, + long initialSequence, + int maxResultsPerOutput, + boolean produceStatusOnEveryEvent) + throws CannotProvideCoderException { + // Test a streaming pipeline + doTest( + events, + expectedStatuses, + expectedOutput, + expectedUnprocessedEvents, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + produceStatusOnEveryEvent, + STREAMING, + false, + ContiguousSequenceRange.EMPTY); + + // Test a batch pipeline + doTest( + events, + expectedStatuses, + expectedOutput, + expectedUnprocessedEvents, + emissionFrequency, + initialSequence, + maxResultsPerOutput, + produceStatusOnEveryEvent, + BATCH, + false, + ContiguousSequenceRange.EMPTY); + } + @Test public void testWindowedProcessing() throws CannotProvideCoderException { @@ -684,223 +685,4 @@ public void testWindowedProcessing() throws CannotProvideCoderException { pipeline.run(); } - - private void testProcessing( - Event[] events, - Collection> expectedStatuses, - Collection> expectedOutput, - int emissionFrequency, - long initialSequence, - int maxResultsPerOutput, - boolean produceStatusOnEveryEvent) - throws CannotProvideCoderException { - testProcessing( - events, - expectedStatuses, - expectedOutput, - NO_EXPECTED_DLQ_EVENTS, - emissionFrequency, - initialSequence, - maxResultsPerOutput, - produceStatusOnEveryEvent); - } - - private void testProcessing( - Event[] events, - Collection> expectedStatuses, - Collection> expectedOutput, - 
Collection>>> expectedUnprocessedEvents, - int emissionFrequency, - long initialSequence, - int maxResultsPerOutput, - boolean produceStatusOnEveryEvent) - throws CannotProvideCoderException { - doTest( - events, - expectedStatuses, - expectedOutput, - expectedUnprocessedEvents, - emissionFrequency, - initialSequence, - maxResultsPerOutput, - produceStatusOnEveryEvent, - STREAMING); - doTest( - events, - expectedStatuses, - expectedOutput, - expectedUnprocessedEvents, - emissionFrequency, - initialSequence, - maxResultsPerOutput, - produceStatusOnEveryEvent, - BATCH); - } - - /** - * The majority of the tests use this method. Testing is done in the global window. - * - * @param events - * @param expectedStatuses - * @param expectedOutput - * @param expectedUnprocessedEvents - * @param emissionFrequency - * @param initialSequence - * @param maxResultsPerOutput - * @param produceStatusOnEveryEvent - * @param streaming - * @throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException - */ - private void doTest( - Event[] events, - Collection> expectedStatuses, - Collection> expectedOutput, - Collection>>> expectedUnprocessedEvents, - int emissionFrequency, - long initialSequence, - int maxResultsPerOutput, - boolean produceStatusOnEveryEvent, - boolean streaming) - throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException { - - Pipeline pipeline = streaming ? streamingPipeline : batchPipeline; - - PCollection rawInput = - streaming - ? createStreamingPCollection(pipeline, events) - : createBatchPCollection(pipeline, events); - PCollection>> input = - rawInput.apply("To KV", ParDo.of(new MapEventsToKV())); - - StringBufferOrderedProcessingHandler handler = - new StringBufferOrderedProcessingHandler(emissionFrequency, initialSequence); - handler.setMaxOutputElementsPerBundle(maxResultsPerOutput); - if (produceStatusOnEveryEvent) { - handler.setProduceStatusUpdateOnEveryEvent(true); - // This disables status updates emitted on timers. - handler.setStatusUpdateFrequency(null); - } else { - handler.setStatusUpdateFrequency( - streaming ? Duration.standardMinutes(5) : Duration.standardSeconds(1)); - } - OrderedEventProcessor orderedEventProcessor = - OrderedEventProcessor.create(handler); - - OrderedEventProcessorResult processingResult = - input.apply("Process Events", orderedEventProcessor); - - PAssert.that("Output matches", processingResult.output()).containsInAnyOrder(expectedOutput); - - if (streaming) { - // Only in streaming the events will arrive in a pre-determined order and the statuses - // will be deterministic. In batch pipelines events can be processed in any order, - // so we skip status verification and rely on the output and unprocessed event matches. - PAssert.that("Statuses match", processingResult.processingStatuses()) - .containsInAnyOrder(expectedStatuses); - } - - // This is a temporary workaround until PAssert changes. 
- boolean unprocessedEventsHaveExceptionStackTrace = false; - for (KV>> event : expectedUnprocessedEvents) { - if (event.getValue().getValue().getReason() == Reason.exception_thrown) { - unprocessedEventsHaveExceptionStackTrace = true; - break; - } - } - - if (unprocessedEventsHaveExceptionStackTrace) { - PAssert.thatSingleton( - "Unprocessed event count", - processingResult - .unprocessedEvents() - .apply( - "Window", - Window.>>>into( - new GlobalWindows()) - .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow())) - .discardingFiredPanes()) - .apply("Count", Count.globally())) - .isEqualTo((long) expectedUnprocessedEvents.size()); - } else { - PAssert.that("Unprocessed events match", processingResult.unprocessedEvents()) - .containsInAnyOrder(expectedUnprocessedEvents); - } - pipeline.run(); - } - - private @UnknownKeyFor @NonNull @Initialized PCollection createBatchPCollection( - Pipeline pipeline, Event[] events) { - return pipeline - .apply("Create Batch Events", Create.of(Arrays.asList(events))) - .apply("Reshuffle", Reshuffle.viaRandomKey()); - } - - private @UnknownKeyFor @NonNull @Initialized PCollection createStreamingPCollection( - Pipeline pipeline, Event[] events) - throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException { - Instant now = Instant.now().minus(Duration.standardMinutes(20)); - TestStream.Builder messageFlow = - TestStream.create(pipeline.getCoderRegistry().getCoder(Event.class)) - .advanceWatermarkTo(now); - - int delayInMilliseconds = 0; - for (Event e : events) { - messageFlow = - messageFlow - .advanceWatermarkTo(now.plus(Duration.millis(++delayInMilliseconds))) - .addElements(e); - } - - // Needed to force the processing time based timers. - messageFlow = messageFlow.advanceProcessingTime(Duration.standardMinutes(15)); - return pipeline.apply("Create Streaming Events", messageFlow.advanceWatermarkToInfinity()); - } - - /** - * Unprocessed event's explanation contains stacktraces which makes tests very brittle because it - * requires hardcoding the line numbers in the code. We use this matcher to only compare on the - * first line of the explanation. 
- */ - static class UnprocessedEventMatcher - extends BaseMatcher>>> - implements SerializableMatcher>>> { - - private KV>> element; - - public UnprocessedEventMatcher(KV>> element) { - this.element = element; - } - - @Override - public boolean matches(Object actual) { - KV>> toMatch = - (KV>>) actual; - - UnprocessedEvent originalEvent = element.getValue().getValue(); - UnprocessedEvent eventToMatch = toMatch.getValue().getValue(); - - return element.getKey().equals(toMatch.getKey()) - && element.getValue().getKey().equals(toMatch.getValue().getKey()) - && originalEvent.getEvent().equals(eventToMatch.getEvent()) - && originalEvent.getReason() == eventToMatch.getReason() - && normalizeExplanation(originalEvent.getExplanation()) - .equals(normalizeExplanation(eventToMatch.getExplanation())); - } - - @Override - public void describeTo(Description description) { - description.appendText("Just some text..."); - } - - static String normalizeExplanation(String value) { - if (value == null) { - return ""; - } - String firstLine = value.split("\n", 1)[0]; - if (firstLine.contains("Exception")) { - return firstLine; - } - return value; - } - } } diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTestBase.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTestBase.java new file mode 100644 index 0000000000000..fd651b919df1b --- /dev/null +++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/OrderedEventProcessorTestBase.java @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered; + +import static org.hamcrest.MatcherAssert.assertThat; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.TestDataflowPipelineOptions; +import org.apache.beam.runners.dataflow.TestDataflowRunner; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.CannotProvideCoderException; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.extensions.ordered.StringBufferOrderedProcessingHandler.StringBufferOrderedProcessingWithGlobalSequenceHandler; +import org.apache.beam.sdk.extensions.ordered.UnprocessedEvent.Reason; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.SerializableMatcher; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Reshuffle; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.transforms.windowing.AfterWatermark; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.Repeatedly; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollection.IsBounded; +import org.apache.beam.sdk.values.PCollectionView; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.hamcrest.BaseMatcher; +import org.hamcrest.Description; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Rule; + +/** + * Ordered Processing tests use the same testing scenario. Events are sent in or out of sequence. + * Each event is a string for a particular key. The output is a concatenation of all strings. 
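+ *
+ * <p>This base class also hosts the shared doTest() harness, the batch and streaming input
+ * builders, and the matchers used by both OrderedEventProcessorPerKeySequenceTest and
+ * OrderedEventProcessorGlobalSequenceTest.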
+ */ +public class OrderedEventProcessorTestBase { + + public static final boolean LAST_EVENT_RECEIVED = true; + public static final int EMISSION_FREQUENCY_ON_EVERY_ELEMENT = 1; + public static final int INITIAL_SEQUENCE_OF_0 = 0; + public static final boolean DONT_PRODUCE_STATUS_ON_EVERY_EVENT = false; + public static final int LARGE_MAX_RESULTS_PER_OUTPUT = 1000; + public static final int EMISSION_FREQUENCY_ON_EVERY_OTHER_EVENT = 2; + public static final boolean PRODUCE_STATUS_ON_EVERY_EVENT = true; + public static final boolean STREAMING = true; + public static final boolean BATCH = false; + public static final Set>>> NO_EXPECTED_DLQ_EVENTS = + Collections.emptySet(); + @Rule public final transient TestPipeline streamingPipeline = TestPipeline.create(); + @Rule public final transient TestPipeline batchPipeline = TestPipeline.create(); + + protected boolean runTestsOnDataflowRunner() { + return Boolean.getBoolean("run-tests-on-dataflow"); + } + + protected String getSystemProperty(String name) { + String property = System.getProperty(name); + if (property == null) { + throw new IllegalStateException("Unable to find system property '" + name + "'"); + } + return property; + } + + static class MapEventsToKV extends DoFn>> { + + @ProcessElement + public void convert( + @Element Event event, OutputReceiver>> outputReceiver) { + outputReceiver.output(KV.of(event.getKey(), KV.of(event.getSequence(), event.getValue()))); + } + } + + static class MapStringBufferStateToString + extends DoFn, KV> { + + @ProcessElement + public void map( + @Element KV element, + OutputReceiver> outputReceiver) { + outputReceiver.output(KV.of(element.getKey(), element.getValue().toString())); + } + } + + /** + * The majority of the tests use this method. Testing is done in the global window. + * + * @throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException + */ + protected void doTest( + Event[] events, + @Nullable Collection> expectedStatuses, + Collection> expectedOutput, + Collection>>> expectedUnprocessedEvents, + int emissionFrequency, + long initialSequence, + int maxResultsPerOutput, + boolean produceStatusOnEveryEvent, + boolean streaming, + boolean isGlobalSequence, + @Nullable ContiguousSequenceRange expectedLastCompletedSequence) + throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException { + + Pipeline pipeline = streaming ? streamingPipeline : batchPipeline; + if (runTestsOnDataflowRunner()) { + pipeline.getOptions().setRunner(TestDataflowRunner.class); + TestDataflowPipelineOptions options = + pipeline.getOptions().as(TestDataflowPipelineOptions.class); + options.setExperiments(Arrays.asList("disable_runner_v2")); + options.setTempRoot("gs://" + getSystemProperty("temp_dataflow_bucket")); + } + PCollection rawInput = + streaming + ? createStreamingPCollection(pipeline, events) + : createBatchPCollection(pipeline, events); + PCollection>> input = + rawInput.apply("To KV", ParDo.of(new MapEventsToKV())); + + OrderedProcessingHandler handler = + isGlobalSequence + ? new StringBufferOrderedProcessingWithGlobalSequenceHandler( + emissionFrequency, initialSequence) + : new StringBufferOrderedProcessingHandler(emissionFrequency, initialSequence); + handler.setMaxOutputElementsPerBundle(maxResultsPerOutput); + if (produceStatusOnEveryEvent) { + handler.setProduceStatusUpdateOnEveryEvent(true); + // This disables status updates emitted on timers. + handler.setStatusUpdateFrequency(null); + } else { + handler.setStatusUpdateFrequency( + streaming ? 
Duration.standardMinutes(5) : Duration.standardSeconds(1)); + } + + OrderedEventProcessor orderedEventProcessor = + OrderedEventProcessor.create(handler); + + OrderedEventProcessorResult processingResult = + input.apply("Process Events", orderedEventProcessor); + + PAssert.that("Output matches", processingResult.output()).containsInAnyOrder(expectedOutput); + + if (streaming && expectedStatuses != null) { + // Only in a streaming pipeline will the events arrive in a pre-determined order and the + // statuses be deterministic. In batch pipelines events can be processed in any order, + // so we skip status verification and rely on the output and unprocessed event matches. + PAssert.that("Statuses match", processingResult.processingStatuses()) + .containsInAnyOrder(expectedStatuses); + } + + // Temporary workaround until PAssert changes: explanations that carry exception stack + // traces can't be matched exactly, so for such events only the count is verified. + boolean unprocessedEventsHaveExceptionStackTrace = false; + for (KV<String, KV<Long, UnprocessedEvent<String>>> event : expectedUnprocessedEvents) { + if (event.getValue().getValue().getReason() == Reason.exception_thrown) { + unprocessedEventsHaveExceptionStackTrace = true; + break; + } + } + + if (unprocessedEventsHaveExceptionStackTrace) { + PAssert.thatSingleton( + "Unprocessed event count", + processingResult + .unprocessedEvents() + .apply( + "Window", + Window.<KV<String, KV<Long, UnprocessedEvent<String>>>>into( + new GlobalWindows()) + .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow())) + .discardingFiredPanes()) + .apply("Count", Count.globally())) + .isEqualTo((long) expectedUnprocessedEvents.size()); + } else { + PAssert.that("Unprocessed events match", processingResult.unprocessedEvents()) + .containsInAnyOrder(expectedUnprocessedEvents); + } + + if (expectedLastCompletedSequence != null && processingResult.latestContiguousRange() != null) { + PCollection<ContiguousSequenceRange> globalSequences = + rawInput.apply( + "Publish Global Sequences", + new GlobalSequenceRangePublisher( + processingResult.latestContiguousRange(), + handler.getKeyCoder(pipeline, input.getCoder()), + handler.getEventCoder(pipeline, input.getCoder()))); + PAssert.that("CompletedSequenceRange verification", globalSequences) + .satisfies(new LastExpectedGlobalSequenceRangeMatcher(expectedLastCompletedSequence)); + } + pipeline.run(); + }
+ + static class LastExpectedGlobalSequenceRangeMatcher + implements SerializableFunction<Iterable<ContiguousSequenceRange>, Void> { + + private final long expectedStart; + private final long expectedEnd; + + LastExpectedGlobalSequenceRangeMatcher(ContiguousSequenceRange expected) { + this.expectedStart = expected.getStart(); + this.expectedEnd = expected.getEnd(); + } + + @Override + public Void apply(Iterable<ContiguousSequenceRange> input) { + StringBuilder listOfRanges = new StringBuilder("["); + Iterator<ContiguousSequenceRange> iterator = input.iterator(); + ContiguousSequenceRange lastRange = null; + while (iterator.hasNext()) { + lastRange = iterator.next(); + + if (listOfRanges.length() > 1) { + listOfRanges.append(", "); + } + listOfRanges.append(lastRange); + } + listOfRanges.append(']'); + boolean foundExpectedRange = + lastRange != null + && lastRange.getStart() == expectedStart + && lastRange.getEnd() == expectedEnd; + + assertThat( + "Expected range not found: [" + + expectedStart + + '-' + + expectedEnd + + "], received ranges: " + + listOfRanges, + foundExpectedRange); + return null; + } + }
+ + private @UnknownKeyFor @NonNull @Initialized PCollection<Event> createBatchPCollection( + Pipeline pipeline, Event[] events) { + return pipeline + .apply("Create Batch Events", Create.of(Arrays.asList(events))) + .apply("Reshuffle", Reshuffle.viaRandomKey()); + } + + private @UnknownKeyFor @NonNull @Initialized PCollection<Event> createStreamingPCollection( + Pipeline pipeline, Event[] events) + throws @UnknownKeyFor @NonNull @Initialized CannotProvideCoderException { + Instant now = Instant.now().minus(Duration.standardMinutes(20)); + TestStream.Builder<Event> messageFlow = + TestStream.create(pipeline.getCoderRegistry().getCoder(Event.class)) + .advanceWatermarkTo(now); + + int delayInMilliseconds = 0; + for (Event e : events) { + messageFlow = + messageFlow + .advanceWatermarkTo(now.plus(Duration.millis(++delayInMilliseconds))) + .addElements(e); + } + + // Needed to force the processing-time-based timers. + messageFlow = messageFlow.advanceProcessingTime(Duration.standardMinutes(15)); + return pipeline.apply("Create Streaming Events", messageFlow.advanceWatermarkToInfinity()); + }
+ + /** + * An unprocessed event's explanation contains stack traces, which would make tests brittle if + * compared verbatim because line numbers would have to be hardcoded. This matcher therefore + * compares only the first line of the explanation. + */ + static class UnprocessedEventMatcher + extends BaseMatcher<KV<String, KV<Long, UnprocessedEvent<String>>>> + implements SerializableMatcher<KV<String, KV<Long, UnprocessedEvent<String>>>> { + + private KV<String, KV<Long, UnprocessedEvent<String>>> element; + + public UnprocessedEventMatcher(KV<String, KV<Long, UnprocessedEvent<String>>> element) { + this.element = element; + } + + @Override + public boolean matches(Object actual) { + KV<String, KV<Long, UnprocessedEvent<String>>> toMatch = + (KV<String, KV<Long, UnprocessedEvent<String>>>) actual; + + UnprocessedEvent<String> originalEvent = element.getValue().getValue(); + UnprocessedEvent<String> eventToMatch = toMatch.getValue().getValue(); + + return element.getKey().equals(toMatch.getKey()) + && element.getValue().getKey().equals(toMatch.getValue().getKey()) + && originalEvent.getEvent().equals(eventToMatch.getEvent()) + && originalEvent.getReason() == eventToMatch.getReason() + && normalizeExplanation(originalEvent.getExplanation()) + .equals(normalizeExplanation(eventToMatch.getExplanation())); + } + + @Override + public void describeTo(Description description) { + description.appendText("Just some text..."); + } + + static String normalizeExplanation(String value) { + if (value == null) { + return ""; + } + // Limit 2 keeps only the first line; a limit of 1 would return the whole string unsplit. + String firstLine = value.split("\n", 2)[0]; + if (firstLine.contains("Exception")) { + return firstLine; + } + return value; + } + }
+ + static class GlobalSequenceRangePublisher + extends PTransform<PCollection<Event>, PCollection<ContiguousSequenceRange>> { + + private final PCollectionView<ContiguousSequenceRange> lastCompletedSequenceRangeView; + private final Coder<String> keyCoder; + private final Coder<String> eventCoder; + + public GlobalSequenceRangePublisher( + PCollectionView<ContiguousSequenceRange> latestCompletedSequenceRange, + Coder<String> keyCoder, + Coder<String> eventCoder) { + this.lastCompletedSequenceRangeView = latestCompletedSequenceRange; + this.keyCoder = keyCoder; + this.eventCoder = eventCoder; + } + + @Override + public PCollection<ContiguousSequenceRange> expand(PCollection<Event> input) { + PCollection<KV<String, KV<Long, String>>> events = + input + // In production pipelines the global sequence will typically be obtained + // by using GenerateSequence. But GenerateSequence doesn't work well with TestStream; + // that's why we use the input events here. + // .apply("Create Ticker", + // GenerateSequence.from(0).to(2).withRate(1, + // Duration.standardSeconds(5))) + .apply("To KV", ParDo.of(new MapEventsToKV())); + if (input.isBounded() == IsBounded.BOUNDED) { + return events.apply( + "Emit SideInput", + ParDo.of(new SideInputEmitter()) + .withSideInput("lastCompletedSequence", lastCompletedSequenceRangeView)); + } else { + PCollection<KV<String, KV<Long, String>>> tickers = + events.apply( + "Create Tickers", + new PerKeyTickerGenerator<>(keyCoder, eventCoder, Duration.standardSeconds(1))); + return tickers.apply( + "Emit SideInput", + ParDo.of(new SideInputEmitter()) + .withSideInput("lastCompletedSequence", lastCompletedSequenceRangeView)); + } + } + + static class SideInputEmitter + extends DoFn<KV<String, KV<Long, String>>, ContiguousSequenceRange> { + + @ProcessElement + public void produceCompletedRange( + @SideInput("lastCompletedSequence") ContiguousSequenceRange sideInput, + OutputReceiver<ContiguousSequenceRange> outputReceiver) { + outputReceiver.output(sideInput); + } + } + } +}
diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/StringBufferOrderedProcessingHandler.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/StringBufferOrderedProcessingHandler.java index 72f3a3cf21b68..1da46c3262e4c 100644 --- a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/StringBufferOrderedProcessingHandler.java +++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/StringBufferOrderedProcessingHandler.java @@ -27,6 +27,24 @@ public class StringBufferOrderedProcessingHandler extends OrderedProcessingHandler<String, String, StringBuilderState, String> { + public static class StringBufferOrderedProcessingWithGlobalSequenceHandler + extends OrderedProcessingGlobalSequenceHandler<String, String, StringBuilderState, String> { + + private final EventExaminer<String, StringBuilderState> eventExaminer; + + public StringBufferOrderedProcessingWithGlobalSequenceHandler( + int emissionFrequency, long initialSequence) { + super(String.class, String.class, StringBuilderState.class, String.class); + this.eventExaminer = new StringEventExaminer(initialSequence, emissionFrequency); + } + + @Override + @NonNull + public EventExaminer<String, StringBuilderState> getEventExaminer() { + return eventExaminer; + } + } + private final EventExaminer<String, StringBuilderState> eventExaminer; public StringBufferOrderedProcessingHandler(int emissionFrequency, long initialSequence) {
diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorCoderTest.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorCoderTest.java new file mode 100644 index 0000000000000..0e5b0b7c819a5 --- /dev/null +++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorCoderTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.extensions.ordered.combiner; + +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import org.apache.beam.sdk.extensions.ordered.combiner.SequenceRangeAccumulator.SequenceRangeAccumulatorCoder; +import org.joda.time.Instant; +import org.junit.Test; + +public class SequenceRangeAccumulatorCoderTest { + + private SequenceRangeAccumulatorCoder coder = SequenceRangeAccumulatorCoder.of(); + + @Test + public void testEncodingEmptyAccumulator() throws IOException { + SequenceRangeAccumulator empty = new SequenceRangeAccumulator(); + + doTestEncodingAndDecoding(empty); + } + + @Test + public void testEncodingAccumulatorWithoutInitialSequence() throws IOException { + SequenceRangeAccumulator accumulator = new SequenceRangeAccumulator(); + accumulator.add(1, Instant.now(), false); + accumulator.add(2, Instant.now(), false); + accumulator.add(3, Instant.now(), false); + accumulator.add(5, Instant.now(), false); + accumulator.add(6, Instant.now(), false); + + doTestEncodingAndDecoding(accumulator); + } + + @Test + public void testEncodingAccumulatorWithInitialSequence() throws IOException { + SequenceRangeAccumulator accumulator = new SequenceRangeAccumulator(); + accumulator.add(1, Instant.now(), true); + accumulator.add(2, Instant.now(), false); + accumulator.add(3, Instant.now(), false); + accumulator.add(5, Instant.now(), false); + accumulator.add(6, Instant.now(), false); + + doTestEncodingAndDecoding(accumulator); + } + + private void doTestEncodingAndDecoding(SequenceRangeAccumulator value) throws IOException { + ByteArrayOutputStream output = new ByteArrayOutputStream(); + coder.encode(value, output); + + SequenceRangeAccumulator decoded = coder.decode(new ByteArrayInputStream(output.toByteArray())); + assertEquals("Accumulator", value, decoded); + } +} diff --git a/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorTest.java b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorTest.java new file mode 100644 index 0000000000000..4082ce6de7585 --- /dev/null +++ b/sdks/java/extensions/ordered/src/test/java/org/apache/beam/sdk/extensions/ordered/combiner/SequenceRangeAccumulatorTest.java @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.extensions.ordered.combiner; + +import java.util.Arrays; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.beam.sdk.extensions.ordered.ContiguousSequenceRange; +import org.joda.time.Instant; +import org.junit.Assert; +import org.junit.Test; + +public class SequenceRangeAccumulatorTest { + + // Atomic just in case tests are run in parallel + private static final AtomicLong currentTicker = new AtomicLong(); + + static Instant nextTimestamp() { + return Instant.ofEpochMilli(currentTicker.getAndIncrement()); + } + + static Instant eventTimestamp(Event[] events, long eventSequence) { + for (Event e : events) { + if (e.sequence == eventSequence) { + return e.timestamp; + } + } + throw new IllegalStateException("Unable to find event with sequence " + eventSequence); + } + + static class Event { + + long sequence; + Instant timestamp; + boolean initialEvent; + + Event(long sequence, Instant ts) { + this(sequence, ts, false); + } + + Event(long sequence, Instant ts, boolean initialEvent) { + this.sequence = sequence; + this.timestamp = ts; + this.initialEvent = initialEvent; + } + } + + @Test + public void testSimpleAccumulation() { + Event[] events = + new Event[] { + new Event(1, nextTimestamp(), true), + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()) + }; + + doTestAccumulation(events, ContiguousSequenceRange.of(1, 4, eventTimestamp(events, 3)), 1); + } + + @Test + public void testReverseArrivalHandling() { + Event[] events = + new Event[] { + new Event(3, nextTimestamp()), + new Event(2, nextTimestamp()), + new Event(1, nextTimestamp(), true) + }; + + Instant timestampOfEventNumber1 = eventTimestamp(events, 1); + doTestAccumulation(events, ContiguousSequenceRange.of(1, 4, timestampOfEventNumber1), 1); + } + + @Test + public void testPartialRangeAccumulation() { + Event[] events = + new Event[] { + new Event(1, nextTimestamp(), true), + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(7, nextTimestamp()), + }; + + doTestAccumulation(events, ContiguousSequenceRange.of(1, 4, eventTimestamp(events, 3)), 3); + } + + @Test + public void testMergingRangeAccumulation() { + Event[] events = + new Event[] { + new Event(1, nextTimestamp(), true), + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(7, nextTimestamp()), + new Event(6, nextTimestamp()), + }; + + doTestAccumulation(events, ContiguousSequenceRange.of(1, 4, eventTimestamp(events, 3)), 2); + } + + @Test + public void testNoStartEvent() { + Event[] events = + new Event[] { + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(1, nextTimestamp()), + new Event(5, nextTimestamp()), + }; + + doTestAccumulation(events, ContiguousSequenceRange.EMPTY, 2); + } + + @Test + public void testNoEventsAccumulation() { + Event[] events = new Event[] {}; + + doTestAccumulation(events, ContiguousSequenceRange.EMPTY, 0); + } + + @Test + public void testRemovingRangesBelowInitialSequenceDuringAccumulation() { + Event[] events = + new Event[] { + // First range + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(1, nextTimestamp()), + + // Second range + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()), + + // This event should prune everything below + new Event(7, nextTimestamp(), true), + }; + + doTestAccumulation(events, ContiguousSequenceRange.of(7, 8, eventTimestamp(events, 7)), 1); + } + + @Test + public void 
testRemovingElementsBelowInitialSequenceDuringAccumulation() { + + Event[] events = + new Event[] { + // First range + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(1, nextTimestamp()), + + // Second range + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()), + new Event(7, nextTimestamp()), + new Event(8, nextTimestamp()), + + // This event should reduce the range. + new Event(7, nextTimestamp(), true), + }; + + Instant timestampOfTheLastEvent = events[events.length - 1].timestamp; + doTestAccumulation(events, ContiguousSequenceRange.of(7, 9, timestampOfTheLastEvent), 1); + } + + private static void doTestAccumulation( + Event[] events, ContiguousSequenceRange expectedResult, int expectedNumberOfRanges) { + SequenceRangeAccumulator accumulator = new SequenceRangeAccumulator(); + Arrays.stream(events).forEach(e -> accumulator.add(e.sequence, e.timestamp, e.initialEvent)); + + Assert.assertEquals( + "Accumulated results", expectedResult, accumulator.largestContinuousRange()); + + Assert.assertEquals("Number of ranges", expectedNumberOfRanges, accumulator.numberOfRanges()); + } + + @Test + public void testEmptyMerge() { + Event[] set1 = new Event[] {}; + Event[] set2 = new Event[] {}; + + ContiguousSequenceRange expectedResult = ContiguousSequenceRange.EMPTY; + int expectedNumberOfRanges = 0; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testMergingNonEmptyWithEmpty() { + Event[] set1 = + new Event[] { + new Event(3, nextTimestamp()), + new Event(2, nextTimestamp()), + new Event(1, nextTimestamp(), true) + }; + Event[] set2 = new Event[] {}; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(1, 4, eventTimestamp(set1, 1L)); + int expectedNumberOfRanges = 1; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testMergingWithLowerNonAdjacentRange() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp(), true), new Event(2, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(1, 3, eventTimestamp(set1, 2L)); + int expectedNumberOfRanges = 2; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testMergingWithoutAnyInitialEvents() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp()), new Event(2, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = ContiguousSequenceRange.EMPTY; + int expectedNumberOfRanges = 2; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testMergingAdjacentRanges() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp(), true), new Event(2, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(3, nextTimestamp()), + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(1, 7, eventTimestamp(set2, 6L)); + int expectedNumberOfRanges = 1; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testPruningSequencesBelowInitial() { + Event[] set1 = + new Event[] { + new Event(1, 
nextTimestamp()), new Event(2, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(3, nextTimestamp(), true), + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(3, 7, eventTimestamp(set2, 6L)); + int expectedNumberOfRanges = 1; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testDuplicateHandling() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp(), true), + new Event(2, nextTimestamp()), + new Event(3, nextTimestamp()), + new Event(5, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(3, nextTimestamp()), + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(1, 7, eventTimestamp(set2, 6L)); + int expectedNumberOfRanges = 1; + + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + @Test + public void testExceptionThrownIfThereAreDifferentInitialSequences() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp(), true), new Event(2, nextTimestamp()), + }; + Event[] set2 = + new Event[] { + new Event(3, nextTimestamp(), true), + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + try { + doTestMerging(set1, set2, ContiguousSequenceRange.EMPTY, 0); + Assert.fail("Expected to throw an exception"); + } catch (IllegalStateException e) { + Assert.assertEquals( + "Exception message", + "Two accumulators contain different initial sequences: 1 and 3", + e.getMessage()); + } + } + + @Test + public void testSelectingHighestTimestampWhenMerging() { + Event[] set1 = + new Event[] { + new Event(1, nextTimestamp(), true), + new Event(2, Instant.ofEpochMilli(currentTicker.get() + 10000)), + }; + Event[] set2 = + new Event[] { + new Event(3, nextTimestamp()), + new Event(4, nextTimestamp()), + new Event(5, nextTimestamp()), + new Event(6, nextTimestamp()) + }; + + ContiguousSequenceRange expectedResult = + ContiguousSequenceRange.of(1, 7, eventTimestamp(set1, 2L)); + int expectedNumberOfRanges = 1; + doTestMerging(set1, set2, expectedResult, expectedNumberOfRanges); + } + + private static void doTestMerging( + Event[] set1, + Event[] set2, + ContiguousSequenceRange expectedResult, + int expectedNumberOfRanges) { + // Try to merge both set2 to set1 and set1 to set2 - both must return the same results + mergeAndTest(set1, set2, expectedResult, expectedNumberOfRanges, "set1"); + mergeAndTest(set2, set1, expectedResult, expectedNumberOfRanges, "set2"); + } + + private static void mergeAndTest( + Event[] set1, + Event[] set2, + ContiguousSequenceRange expectedResult, + int expectedNumberOfRanges, + String firstSetName) { + final SequenceRangeAccumulator a1 = new SequenceRangeAccumulator(); + Arrays.stream(set1).forEach(e -> a1.add(e.sequence, e.timestamp, e.initialEvent)); + + final SequenceRangeAccumulator a2 = new SequenceRangeAccumulator(); + Arrays.stream(set2).forEach(e -> a2.add(e.sequence, e.timestamp, e.initialEvent)); + + a1.merge(a2); + + Assert.assertEquals( + "Accumulated results - " + firstSetName, expectedResult, a1.largestContinuousRange()); + + Assert.assertEquals( + "Number of ranges - " + firstSetName, expectedNumberOfRanges, a1.numberOfRanges()); + } +} From c243491254896e039e912662e4cfbe4bd38c766f Mon Sep 17 00:00:00 2001 From: reuvenlax Date: Wed, 9 Oct 2024 11:09:45 
-0700 Subject: [PATCH 13/14] Merge pull request #32705: fix schema inference for parameterized types --- .../java/org/apache/beam/sdk/Pipeline.java | 2 +- .../apache/beam/sdk/coders/CoderRegistry.java | 44 ++++++++++++++----- .../beam/sdk/schemas/SchemaRegistryTest.java | 20 +++++++++ 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/Pipeline.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/Pipeline.java index d3b58dd26bd24..9006035279f32 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/Pipeline.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/Pipeline.java @@ -335,7 +335,7 @@ public PipelineResult run(PipelineOptions options) { /** Returns the {@link CoderRegistry} that this {@link Pipeline} uses. */ public CoderRegistry getCoderRegistry() { if (coderRegistry == null) { - coderRegistry = CoderRegistry.createDefault(); + coderRegistry = CoderRegistry.createDefault(getSchemaRegistry()); } return coderRegistry; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderRegistry.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderRegistry.java index df64789ac3d27..e404665e4f66d 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderRegistry.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/coders/CoderRegistry.java @@ -42,6 +42,8 @@ import org.apache.beam.sdk.io.fs.MetadataCoder; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.fs.ResourceIdCoder; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; +import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.windowing.IntervalWindow; import org.apache.beam.sdk.util.CoderUtils; @@ -195,11 +197,17 @@ public Coder coderFor( * the lexicographically smallest {@link Class#getName() class name} being used. * */ + public static CoderRegistry createDefault(@Nullable SchemaRegistry schemaRegistry) { + return new CoderRegistry(schemaRegistry); + } + + /** Backwards compatible version of createDefault. */ public static CoderRegistry createDefault() { - return new CoderRegistry(); + return new CoderRegistry(null); } - private CoderRegistry() { + private CoderRegistry(@Nullable SchemaRegistry schemaRegistry) { + this.schemaRegistry = schemaRegistry; coderProviders = new ArrayDeque<>(REGISTERED_CODER_FACTORIES); } @@ -590,6 +598,8 @@ private static boolean isNullOrEmpty(Collection c) { /** The list of {@link CoderProvider coder providers} to use to provide Coders. */ private ArrayDeque coderProviders; + private final @Nullable SchemaRegistry schemaRegistry; + /** * Returns a {@link Coder} to use for values of the given type, in a context where the given types * use the given coders. 
@@ -650,16 +660,28 @@ private Coder getCoderFromParameterizedType( List> typeArgumentCoders = new ArrayList<>(); for (Type typeArgument : type.getActualTypeArguments()) { - try { - Coder typeArgumentCoder = - getCoderFromTypeDescriptor(TypeDescriptor.of(typeArgument), typeCoderBindings); - typeArgumentCoders.add(typeArgumentCoder); - } catch (CannotProvideCoderException exc) { - throw new CannotProvideCoderException( - String.format( - "Cannot provide coder for parameterized type %s: %s", type, exc.getMessage()), - exc); + Coder typeArgumentCoder = null; + if (schemaRegistry != null) { + TypeDescriptor typeDescriptor = TypeDescriptor.of(typeArgument); + try { + typeArgumentCoder = schemaRegistry.getSchemaCoder(typeDescriptor); + } catch (NoSuchSchemaException e) { + // No schema. + } + } + + if (typeArgumentCoder == null) { + try { + typeArgumentCoder = + getCoderFromTypeDescriptor(TypeDescriptor.of(typeArgument), typeCoderBindings); + } catch (CannotProvideCoderException exc) { + throw new CannotProvideCoderException( + String.format( + "Cannot provide coder for parameterized type %s: %s", type, exc.getMessage()), + exc); + } } + typeArgumentCoders.add(typeArgumentCoder); } return getCoderFromFactories(TypeDescriptor.of(type), typeArgumentCoders); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaRegistryTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaRegistryTest.java index 55a16e9faf391..54c80747b13bc 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaRegistryTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaRegistryTest.java @@ -26,6 +26,10 @@ import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; import java.util.List; +import org.apache.beam.sdk.coders.CannotProvideCoderException; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.CoderRegistry; +import org.apache.beam.sdk.coders.IterableCoder; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.utils.TestJavaBeans.SimpleBean; import org.apache.beam.sdk.schemas.utils.TestPOJOs.SimplePOJO; @@ -223,6 +227,22 @@ public void testRegisterPojo() throws NoSuchSchemaException { assertTrue(SIMPLE_POJO_SCHEMA.equivalent(schema)); } + @Test + public void testSchemaTypeParameterInsideCoder() throws CannotProvideCoderException { + SchemaRegistry schemaRegistry = SchemaRegistry.createDefault(); + schemaRegistry.registerPOJO(SimplePOJO.class); + + CoderRegistry coderRegistry = CoderRegistry.createDefault(schemaRegistry); + Coder> coder = + coderRegistry.getCoder(TypeDescriptors.iterables(TypeDescriptor.of(SimplePOJO.class))); + assertTrue(coder instanceof IterableCoder); + assertEquals(1, coder.getCoderArguments().size()); + assertTrue(coder.getCoderArguments().get(0) instanceof SchemaCoder); + assertTrue( + SIMPLE_POJO_SCHEMA.equivalent( + ((SchemaCoder) coder.getCoderArguments().get(0)).getSchema())); + } + @Test public void testRegisterJavaBean() throws NoSuchSchemaException { SchemaRegistry registry = SchemaRegistry.createDefault(); From 2ee6100980b4661a9db88d507c8b2c667f07b1d4 Mon Sep 17 00:00:00 2001 From: Dmitry Ulyumdzhiev <59957689+deadb0d4@users.noreply.github.com> Date: Wed, 9 Oct 2024 20:11:38 +0100 Subject: [PATCH 14/14] Handle Date type in HCatToRow (#32695) * Handle Date type in HCatToRow Some initial notes: - The issue (#20685) deals with java.sql.Date, which I wasn't able to reproduce fully (I can currently write hcatalog 
hadoop.hive Date values) - On this note, 267f76f3c2036c27dcbc94c563ecd1a2d4481f65 changed the code involved so that there's a direct cast to AbstractInstant in RowUtils.java. This doesn't change much, but FYI. * Run: ./gradlew :sdks:java:io:hcatalog:spotlessApply * review cr: castTypes util - s/castHDate/maybeCastHDate/ to be more concise - move the values manipulation to a separate util --- .../beam/sdk/io/hcatalog/HCatToRow.java | 17 +++++++- .../beam/sdk/io/hcatalog/HCatalogIOTest.java | 41 +++++++++++++++++++ .../io/hcatalog/test/HCatalogIOTestUtils.java | 10 +++++ 3 files changed, 67 insertions(+), 1 deletion(-)
diff --git a/sdks/java/io/hcatalog/src/main/java/org/apache/beam/sdk/io/hcatalog/HCatToRow.java b/sdks/java/io/hcatalog/src/main/java/org/apache/beam/sdk/io/hcatalog/HCatToRow.java index 8e29650f3fc3e..e5bdf18ecbcf4 100644 --- a/sdks/java/io/hcatalog/src/main/java/org/apache/beam/sdk/io/hcatalog/HCatToRow.java +++ b/sdks/java/io/hcatalog/src/main/java/org/apache/beam/sdk/io/hcatalog/HCatToRow.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.hcatalog; +import java.util.List; +import java.util.stream.Collectors; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; @@ -25,6 +27,7 @@ import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; import org.apache.hive.hcatalog.data.HCatRecord; +import org.joda.time.Instant; /** Utilities to convert {@link HCatRecord HCatRecords} to {@link Row Rows}. */ @SuppressWarnings({ @@ -74,6 +77,18 @@ public PCollection<Row> expand(PBegin input) { private static class HCatToRowFn extends DoFn<HCatRecord, Row> { private final Schema schema; + private Object maybeCastHDate(Object obj) { + if (obj instanceof org.apache.hadoop.hive.common.type.Date) { + return new Instant(((org.apache.hadoop.hive.common.type.Date) obj).toEpochMilli()); + } + return obj; + } + + /** Casts objects of types that aren't supported by {@link Row}. */ + private List<Object> castTypes(List<Object> values) { + return values.stream().map(this::maybeCastHDate).collect(Collectors.toList()); + } + HCatToRowFn(Schema schema) { this.schema = schema; } @@ -81,7 +96,7 @@ private static class HCatToRowFn extends DoFn<HCatRecord, Row> { @ProcessElement public void processElement(ProcessContext c) { HCatRecord hCatRecord = c.element(); - c.output(Row.withSchema(schema).addValues(hCatRecord.getAll()).build()); + c.output(Row.withSchema(schema).addValues(castTypes(hCatRecord.getAll())).build()); } } }
diff --git a/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/HCatalogIOTest.java b/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/HCatalogIOTest.java index 4bb7e1bd70441..3d97a2ccc1d98 100644 --- a/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/HCatalogIOTest.java +++ b/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/HCatalogIOTest.java @@ -22,6 +22,7 @@ import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.TEST_RECORDS_COUNT; import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.TEST_TABLE; import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.buildHCatRecords; +import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.buildHCatRecordsWithDate; import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.getConfigPropertiesAsMap; import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.getExpectedRecords; import static org.apache.beam.sdk.io.hcatalog.test.HCatalogIOTestUtils.getReaderContext; @@ -54,12 +55,14 @@ import org.apache.beam.sdk.testing.SourceTestUtils; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Distinct; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Watch; import org.apache.beam.sdk.util.SerializableUtils; import org.apache.beam.sdk.util.UserCodeException; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; import org.apache.hive.hcatalog.data.DefaultHCatRecord; @@ -230,6 +233,44 @@ public void processElement(ProcessContext c) { readAfterWritePipeline.run(); } + /** Tests reading a Date-typed column from an HCatalog table. */ + @Test + public void testReadHCatalogDateType() throws Exception { + service.executeQuery("drop table if exists " + TEST_TABLE); + service.executeQuery("create table " + TEST_TABLE + "(mycol1 string, mycol2 date)"); + + defaultPipeline + .apply(Create.of(buildHCatRecordsWithDate(TEST_RECORDS_COUNT))) + .apply( + HCatalogIO.write() + .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf())) + .withDatabase(TEST_DATABASE) + .withTable(TEST_TABLE) + .withPartition(new java.util.HashMap<>())); + defaultPipeline.run().waitUntilFinish(); + + final PCollection<String> output = + readAfterWritePipeline + .apply( + HCatToRow.fromSpec( + HCatalogIO.read() + .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf())) + .withDatabase(TEST_DATABASE) + .withTable(TEST_TABLE) + .withFilter(TEST_FILTER))) + .apply( + ParDo.of( + new DoFn<Row, String>() { + @ProcessElement + public void processElement(ProcessContext c) { + c.output(c.element().getDateTime("mycol2").toString("yyyy-MM-dd HH:mm:ss")); + } + })) + .apply(Distinct.create()); + PAssert.that(output).containsInAnyOrder(ImmutableList.of("2014-01-20 00:00:00")); + readAfterWritePipeline.run(); + } + /** Test of Write to a non-existent table. */ @Test public void testWriteFailureTableDoesNotExist() {
diff --git a/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/test/HCatalogIOTestUtils.java b/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/test/HCatalogIOTestUtils.java index d0d1d850a6cbe..c09c2c906d649 100644 --- a/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/test/HCatalogIOTestUtils.java +++ b/sdks/java/io/hcatalog/src/test/java/org/apache/beam/sdk/io/hcatalog/test/HCatalogIOTestUtils.java @@ -26,6 +26,7 @@ import java.util.Map.Entry; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.values.KV; +import org.apache.hadoop.hive.common.type.Date; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.DefaultHCatRecord; @@ -120,4 +121,13 @@ public static Map<String, String> getConfigPropertiesAsMap(HiveConf hiveConf) { private static DefaultHCatRecord toHCatRecord(int value) { return new DefaultHCatRecord(Arrays.asList("record " + value, value)); } + + /** Returns a list of HCatRecords of the given size, each with a fixed dummy date field. */ + public static List<HCatRecord> buildHCatRecordsWithDate(int size) { + List<HCatRecord> expected = new ArrayList<>(); + for (int i = 0; i < size; i++) { + expected.add(new DefaultHCatRecord(Arrays.asList("record " + i, Date.valueOf("2014-01-20")))); + } + return expected; + } }
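For reference, the date conversion introduced by this last patch can be exercised outside a pipeline. The following is a minimal standalone sketch, not part of the patch: it assumes Hive's org.apache.hadoop.hive.common.type.Date (whose toEpochMilli() yields midnight UTC of the given day, as used by maybeCastHDate above) and Joda-Time on the classpath; the class name HiveDateCastSketch is illustrative.

import org.joda.time.Instant;

public class HiveDateCastSketch {

  // Mirrors the maybeCastHDate logic: Hive Date -> Joda Instant, other values pass through.
  static Object maybeCastHDate(Object obj) {
    if (obj instanceof org.apache.hadoop.hive.common.type.Date) {
      return new Instant(((org.apache.hadoop.hive.common.type.Date) obj).toEpochMilli());
    }
    return obj;
  }

  public static void main(String[] args) {
    Object cast = maybeCastHDate(org.apache.hadoop.hive.common.type.Date.valueOf("2014-01-20"));
    // Prints 2014-01-20T00:00:00.000Z, the instant asserted by testReadHCatalogDateType.
    System.out.println(cast);
  }
}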

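The range semantics exercised by SequenceRangeAccumulatorTest earlier in this series can also be summarized independently of Beam: starting from the initial sequence, the largest contiguous range is the half-open interval covering consecutive sequence numbers. Below is a deliberately simplified sketch of that reading only; the real SequenceRangeAccumulator additionally tracks per-range timestamps, merging, and pruning below the initial sequence, and the names ContiguousRangeSketch and largestContiguousRange are hypothetical.

import java.util.Arrays;
import java.util.TreeSet;

public class ContiguousRangeSketch {

  // Returns {start, endExclusive} for the run of consecutive sequences beginning at
  // initialSequence, or null when the initial sequence was never observed
  // (the analogue of ContiguousSequenceRange.EMPTY in the tests above).
  static long[] largestContiguousRange(TreeSet<Long> sequences, long initialSequence) {
    if (!sequences.contains(initialSequence)) {
      return null;
    }
    long end = initialSequence;
    while (sequences.contains(end + 1)) {
      end++;
    }
    return new long[] {initialSequence, end + 1}; // the end is exclusive, as in the tests
  }

  public static void main(String[] args) {
    TreeSet<Long> seen = new TreeSet<>(Arrays.asList(1L, 2L, 3L, 5L, 7L));
    long[] range = largestContiguousRange(seen, 1L);
    // Prints 1..4, matching testPartialRangeAccumulation's expected range of(1, 4, ...).
    System.out.println(range[0] + ".." + range[1]);
  }
}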