From 6aac47c8354eacaede1eb50c80288c36e734e6f6 Mon Sep 17 00:00:00 2001 From: lahariguduru Date: Thu, 1 Aug 2024 20:56:28 +0000 Subject: [PATCH 01/78] [CsvIO] Create CsvIOParse Class (#32028) * [CsvIO] Create CsvIOParse Class Co-authored-by: Francis O'Hara * rough draft * [CsvIO] Create CsvIOParse Class Co-authored-by: Francis O'Hara * Deleted changes made to CsvIOStringToRecord Class Co-authored-by: Francis O'Hara * [CsvIO] update tests for CsvIO for more coverage Co-authored-by: Francis O'Hara * added more tests for CsvIOParse Co-authored-by: Francis O'Hara * Added documentation for CsvIOParse Co-authored-by: Francis O'Hara --------- Co-authored-by: Francis O'Hara --- .../org/apache/beam/sdk/io/csv/CsvIO.java | 162 ++++++++++ .../apache/beam/sdk/io/csv/CsvIOParse.java | 84 +++++ .../sdk/io/csv/CsvIOStringToCsvRecord.java | 2 +- .../beam/sdk/io/csv/CsvIOParseTest.java | 197 ++++++++++++ .../io/csv/CsvIOStringToCsvRecordTest.java | 12 + .../org/apache/beam/sdk/io/csv/CsvIOTest.java | 300 ++++++++++++++++++ 6 files changed, 756 insertions(+), 1 deletion(-) create mode 100644 sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java create mode 100644 sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java create mode 100644 sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java index 04141e5c677aa..fc2b68c0a8936 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIO.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.csv; import static java.util.Objects.requireNonNull; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import static org.apache.beam.sdk.values.TypeDescriptors.rows; import static 
org.apache.beam.sdk.values.TypeDescriptors.strings; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; @@ -35,8 +36,13 @@ import org.apache.beam.sdk.io.WriteFiles; import org.apache.beam.sdk.io.WriteFilesResult; import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.JavaBeanSchema; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.schemas.SchemaProvider; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.SerializableFunction; @@ -44,6 +50,7 @@ import org.apache.beam.sdk.transforms.display.HasDisplayData; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.commons.csv.CSVFormat; @@ -340,6 +347,161 @@ public static Write writeRows(String to, CSVFormat csvFormat) { .build(); } + /** + * Instantiates a {@link CsvIOParse} for parsing CSV string records into custom {@link + * Schema}-mapped {@code Class}es from the records' assumed CsvFormat. + * See the Beam + * Programming Guide on how to configure your custom {@code Class} for Beam to infer its + * {@link Schema} using a {@link SchemaProvider} annotation such as {@link AutoValueSchema} or + * {@link JavaBeanSchema}. + * + *

Example usage

+ * + * The example below illustrates parsing CsvFormat#DEFAULT + * formatted CSV string records, read from {@link TextIO.Read}, into an {@link AutoValueSchema} + * annotated AutoValue data + * class {@link PCollection}. + * + *
{@code
+   * // SomeDataClass is a data class configured for Beam to automatically infer its Schema.
+   * @DefaultSchema(AutoValueSchema.class)
+   * @AutoValue
+   * abstract class SomeDataClass {
+   *
+   *    abstract String getSomeString();
+   *    abstract Integer getSomeInteger();
+   *
+   *    @AutoValue.Builder
+   *    abstract static class Builder {
+   *      abstract Builder setSomeString(String value);
+   *      abstract Builder setSomeInteger(Integer value);
+   *
+   *      abstract SomeDataClass build();
+   *    }
+   * }
+   *
+   * // Pipeline example reads CSV string records from Google Cloud storage and writes to BigQuery.
+   * Pipeline pipeline = Pipeline.create();
+   *
+   * // Read CSV records from Google Cloud storage using TextIO.
+   * PCollection<String> csvRecords = pipeline
+   *  .apply(TextIO.read().from("gs://bucket/folder/*.csv"));
+   *
+   * // Apply the CSV records PCollection to the CsvIOParse transform instantiated using CsvIO.parse.
+   * CsvIOParseResult<SomeDataClass> result = csvRecords.apply(CsvIO.parse(
+   *      SomeDataClass.class,
+   *      CSVFormat.DEFAULT.withHeader("someString", "someInteger")
+   * ));
+   *
+   * // Acquire any processing errors to either write to logs or apply to a downstream dead letter queue such as BigQuery.
+   * result.getErrors().apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_errors")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   *
+   * // Acquire the successful PCollection output.
+   * PCollection<SomeDataClass> output = result.getOutput();
+   *
+   * // Do something with the output such as write to BigQuery.
+   * output.apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_output")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   * }
+ */ + public static CsvIOParse parse(Class klass, CSVFormat csvFormat) { + CsvIOParseHelpers.validateCsvFormat(csvFormat); + SchemaProvider provider = new DefaultSchema.DefaultSchemaProvider(); + TypeDescriptor type = TypeDescriptor.of(klass); + Schema schema = + checkStateNotNull( + provider.schemaFor(type), + "Illegal %s: Schema could not be generated from given %s class", + Schema.class, + klass); + CsvIOParseHelpers.validateCsvFormatWithSchema(csvFormat, schema); + SerializableFunction fromRowFn = + checkStateNotNull( + provider.fromRowFunction(type), + "FromRowFn could not be generated from the given %s class", + klass); + SerializableFunction toRowFn = + checkStateNotNull( + provider.toRowFunction(type), + "ToRowFn could not be generated from the given %s class", + klass); + SchemaCoder coder = SchemaCoder.of(schema, type, toRowFn, fromRowFn); + CsvIOParseConfiguration.Builder builder = CsvIOParseConfiguration.builder(); + builder.setCsvFormat(csvFormat).setSchema(schema).setCoder(coder).setFromRowFn(fromRowFn); + return CsvIOParse.builder().setConfigBuilder(builder).build(); + } + + /** + * Instantiates a {@link CsvIOParse} for parsing CSV string records into {@link Row}s from the + * records' assumed CsvFormat + * and expected {@link Schema}. + * + *

Example usage

+ * + * The example below illustrates parsing CsvFormat#DEFAULT + * formatted CSV string records, read from {@link TextIO.Read}, into a {@link Row} {@link + * PCollection}. + * + *
{@code
+   * // Define the expected Schema.
+   * Schema schema = Schema.of(
+   *  Schema.Field.of("someString", FieldType.STRING),
+   *  Schema.Field.of("someInteger", FieldType.INT32)
+   * );
+   *
+   * // Pipeline example reads CSV string records from Google Cloud storage and writes to BigQuery.
+   * Pipeline pipeline = Pipeline.create();
+   *
+   * // Read CSV records from Google Cloud storage using TextIO.
+   * PCollection<String> csvRecords = pipeline
+   *  .apply(TextIO.read().from("gs://bucket/folder/*.csv"));
+   *
+   * // Apply the CSV records PCollection to the CsvIOParse transform instantiated using CsvIO.parseRows.
+   * CsvIOParseResult<Row> result = csvRecords.apply(CsvIO.parseRows(
+   *      schema,
+   *      CSVFormat.DEFAULT.withHeader("someString", "someInteger")
+   * ));
+   *
+   * // Acquire any processing errors to either write to logs or apply to a downstream dead letter queue such as BigQuery.
+   * result.getErrors().apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_errors")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   *
+   * // Acquire the successful PCollection output.
+   * PCollection<Row> output = result.getOutput();
+   *
+   * // Do something with the output such as write to BigQuery.
+   * output.apply(BigQueryIO.write()
+   *  .to("project:dataset.table_of_output")
+   *  .useBeamSchema()
+   *  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
+   *  .withWriteDisposition(WriteDisposition.WRITE_APPEND));
+   * }
+ */ + public static CsvIOParse parseRows(Schema schema, CSVFormat csvFormat) { + CsvIOParseHelpers.validateCsvFormat(csvFormat); + CsvIOParseHelpers.validateCsvFormatWithSchema(csvFormat, schema); + RowCoder coder = RowCoder.of(schema); + CsvIOParseConfiguration.Builder builder = CsvIOParseConfiguration.builder(); + builder.setCsvFormat(csvFormat).setSchema(schema).setCoder(coder).setFromRowFn(row -> row); + return CsvIOParse.builder().setConfigBuilder(builder).build(); + } + /** {@link PTransform} for writing CSV files. */ @AutoValue public abstract static class Write extends PTransform, WriteFilesResult> diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java new file mode 100644 index 0000000000000..0a27cdbc57eca --- /dev/null +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import com.google.auto.value.AutoValue; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.Flatten; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionList; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; + +/** + * {@link PTransform} for Parsing CSV Record Strings into {@link Schema}-mapped target types. {@link + * CsvIOParse} is not instantiated directly but via {@link CsvIO#parse} or {@link CsvIO#parseRows}. + */ +@AutoValue +public abstract class CsvIOParse extends PTransform, CsvIOParseResult> { + + final TupleTag outputTag = new TupleTag() {}; + final TupleTag errorTag = new TupleTag() {}; + + static CsvIOParse.Builder builder() { + return new AutoValue_CsvIOParse.Builder<>(); + } + + // TODO(https://github.com/apache/beam/issues/31875): Implement in future PR. + public CsvIOParse withCustomRecordParsing( + Map> customProcessingMap) { + return this; + } + + /** Contains all configuration parameters for {@link CsvIOParse}. 
*/ + abstract CsvIOParseConfiguration.Builder getConfigBuilder(); + + @AutoValue.Builder + abstract static class Builder { + abstract Builder setConfigBuilder(CsvIOParseConfiguration.Builder configBuilder); + + abstract CsvIOParse build(); + } + + @Override + public CsvIOParseResult expand(PCollection input) { + CsvIOParseConfiguration configuration = getConfigBuilder().build(); + + CsvIOStringToCsvRecord stringToCsvRecord = + new CsvIOStringToCsvRecord(configuration.getCsvFormat()); + CsvIOParseResult> stringToCsvRecordResult = input.apply(stringToCsvRecord); + PCollection> stringToRecordOutput = stringToCsvRecordResult.getOutput(); + PCollection stringToRecordErrors = stringToCsvRecordResult.getErrors(); + + CsvIORecordToObjects recordToObjects = new CsvIORecordToObjects(configuration); + CsvIOParseResult recordToObjectsResult = stringToRecordOutput.apply(recordToObjects); + PCollection output = recordToObjectsResult.getOutput(); + PCollection recordToObjectsErrors = recordToObjectsResult.getErrors(); + + PCollectionList errorList = + PCollectionList.of(stringToRecordErrors).and(recordToObjectsErrors); + PCollection errors = errorList.apply(Flatten.pCollections()); + + PCollectionTuple result = PCollectionTuple.of(outputTag, output).and(errorTag, errors); + return CsvIOParseResult.of(outputTag, configuration.getCoder(), errorTag, result); + } +} diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java index 5fc4954cb450c..7fe0f5090d677 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecord.java @@ -84,7 +84,7 @@ public void process(@Element String line, MultiOutputReceiver receiver) { for (CSVRecord record : csvParser.getRecords()) { receiver.get(outputTag).output(csvRecordtoList(record)); } - } catch 
(IOException e) { + } catch (RuntimeException | IOException e) { receiver .get(errorTag) .output( diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java new file mode 100644 index 0000000000000..05d6982004f45 --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesToRowFn; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.commons.csv.CSVFormat; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvIOParseTest { + + private static final String[] HEADER = + new String[] {"aBoolean", "aDouble", "aFloat", "anInteger", "aLong", "aString"}; + private static final Coder + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER = + SchemaCoder.of( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + 
NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR, + nullableAllPrimitiveDataTypesToRowFn(), + nullableAllPrimitiveDataTypesFromRowFn()); + private static final SerializableFunction ROW_ROW_SERIALIZABLE_FUNCTION = row -> row; + @Rule public final TestPipeline pipeline = TestPipeline.create(); + + @Test + public void isSerializable() throws Exception { + SerializableUtils.ensureSerializable(CsvIOParse.class); + } + + @Test + public void parseRows() { + PCollection records = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "🏵,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", null) + .withFieldValue("aDouble", 6.0) + .withFieldValue("aFloat", 7.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 9L) + .withFieldValue("aString", "bar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", false) + .withFieldValue("aDouble", 12.0) + .withFieldValue("aFloat", 14.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 24L) + .withFieldValue("aString", "foo\nbar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo,bar") + .build()); + + CsvIOParseResult result = + records.apply( + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + csvFormat(), + emptyCustomProcessingMap(), 
+ ROW_ROW_SERIALIZABLE_FUNCTION, + RowCoder.of(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA))); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void parsePOJOs() { + PCollection records = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "🏵,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo"), + nullableAllPrimitiveDataTypes(null, 6.0d, 7.0f, 8, 9L, "bar"), + nullableAllPrimitiveDataTypes(false, 12.0d, 14.0f, 8, 24L, "foo\nbar"), + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo,bar")); + + CsvIOParseResult result = + records.apply( + underTest( + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, + csvFormat(), + emptyCustomProcessingMap(), + nullableAllPrimitiveDataTypesFromRowFn(), + NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER)); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader(HEADER) + .withCommentMarker('#') + .withNullString("🏵") + .withEscape('$'); + } + + private static PCollection csvRecords(Pipeline pipeline, String... 
lines) { + return pipeline.apply( + Create.of(Arrays.asList(lines)).withCoder(NullableCoder.of(StringUtf8Coder.of()))); + } + + private static CsvIOParse underTest( + Schema schema, + CSVFormat csvFormat, + Map> customProcessingMap, + SerializableFunction fromRowFn, + Coder coder) { + CsvIOParseConfiguration.Builder configBuilder = + CsvIOParseConfiguration.builder() + .setSchema(schema) + .setCsvFormat(csvFormat) + .setCustomProcessingMap(customProcessingMap) + .setFromRowFn(fromRowFn) + .setCoder(coder); + return CsvIOParse.builder().setConfigBuilder(configBuilder).build(); + } + + private static Map> emptyCustomProcessingMap() { + return new HashMap<>(); + } +} diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java index 7cbba3335dd28..1618962ef394b 100644 --- a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOStringToCsvRecordTest.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.values.PCollection; import org.apache.commons.csv.CSVFormat; @@ -547,6 +548,17 @@ public void testMultiLineCsvRecord() { pipeline.run(); } + @Test + public void givenInvalidCsvRecord_throws() { + CSVFormat csvFormat = csvFormat().withQuote('"'); + PCollection input = + pipeline.apply(Create.of(headerLine(csvFormat), "a,\"1,1.1", "b,2,2.2", "c,3,3.3")); + CsvIOStringToCsvRecord underTest = new CsvIOStringToCsvRecord(csvFormat); + CsvIOParseResult> result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + pipeline.run(); + } + private static CSVFormat csvFormat() { return 
CSVFormat.DEFAULT.withAllowDuplicateHeaderNames(false).withHeader(header); } diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java new file mode 100644 index 0000000000000..13e09725e952d --- /dev/null +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOTest.java @@ -0,0 +1,300 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.csv; + +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import static org.junit.Assert.assertThrows; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.NullableCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans; +import org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NullableAllPrimitiveDataTypes; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.commons.csv.CSVFormat; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvIOTest { + private static final String[] HEADER = + new String[] {"aBoolean", "aDouble", "aFloat", "anInteger", "aLong", "aString"}; + + @Test + public void parseRows() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "N/A,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo") + 
.build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", null) + .withFieldValue("aDouble", 6.0) + .withFieldValue("aFloat", 7.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 9L) + .withFieldValue("aString", "bar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", false) + .withFieldValue("aDouble", 12.0) + .withFieldValue("aFloat", 14.0f) + .withFieldValue("anInteger", 8) + .withFieldValue("aLong", 24L) + .withFieldValue("aString", "foo\nbar") + .build(), + Row.withSchema(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA) + .withFieldValue("aBoolean", true) + .withFieldValue("aDouble", 1.0) + .withFieldValue("aFloat", 2.0f) + .withFieldValue("anInteger", 3) + .withFieldValue("aLong", 4L) + .withFieldValue("aString", "foo,bar") + .build()); + + CsvIOParse underTest = + CsvIO.parseRows(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, csvFormat()); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void parsesPOJOs() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + csvRecords( + pipeline, + "# This is a comment", + "aBoolean,aDouble,aFloat,anInteger,aLong,aString", + "true,1.0,2.0,3,4,foo", + "N/A,6.0,7.0,8,9,bar", + "false,12.0,14.0,8,24,\"foo\nbar\"", + "true,1.0,2.0,3,4,foo$,bar"); + List want = + Arrays.asList( + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo"), + nullableAllPrimitiveDataTypes(null, 6.0d, 7.0f, 8, 9L, "bar"), + nullableAllPrimitiveDataTypes(false, 12.0d, 14.0f, 8, 24L, "foo\nbar"), + nullableAllPrimitiveDataTypes(true, 1.0d, 2.0f, 3, 4L, "foo,bar")); + + CsvIOParse underTest = + CsvIO.parse(NullableAllPrimitiveDataTypes.class, csvFormat()); + CsvIOParseResult result = input.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + 
PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenInvalidCsvFormat_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(true); + Schema schema = + Schema.builder() + .addStringField("a_string") + .addInt32Field("an_integer") + .addDoubleField("a_double") + .build(); + assertThrows(IllegalArgumentException.class, () -> CsvIO.parseRows(schema, csvFormat)); + pipeline.run(); + } + + @Test + public void givenMismatchedCsvFormatAndSchema_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(true); + Schema schema = Schema.builder().addStringField("a_string").addDoubleField("a_double").build(); + assertThrows(IllegalArgumentException.class, () -> CsvIO.parseRows(schema, csvFormat)); + pipeline.run(); + } + + @Test + public void givenNullSchema_throws() { + Pipeline pipeline = Pipeline.create(); + assertThrows(NullPointerException.class, () -> CsvIO.parseRows(null, csvFormat())); + pipeline.run(); + } + + @Test + public void givenNonSchemaMappedClass_throws() { + Pipeline pipeline = Pipeline.create(); + CSVFormat csvFormat = + CSVFormat.DEFAULT + .withHeader("a_string", "an_integer", "a_double") + .withAllowDuplicateHeaderNames(false); + assertThrows( + IllegalStateException.class, () -> CsvIO.parse(NonSchemaMappedPojo.class, csvFormat)); + pipeline.run(); + } + + @Test + public void givenStringToRecordError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = pipeline.apply(Create.of("true,\"1.1,3.141592,1,5,foo")); + Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = 
CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply("Total Errors", Count.globally())).isEqualTo(1L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIOStringToCsvRecord.class.getName())) + .isEqualTo(1L); + + pipeline.run(); + } + + @Test + public void givenRecordToObjectError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + pipeline.apply(Create.of("true,1.1,3.141592,this_is_an_error,5,foo")); + Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIORecordToObjects.class.getName())) + .isEqualTo(1L); + pipeline.run(); + } + + @Test + public void givenStringToRecordError_RecordToObjectError_emits() { + Pipeline pipeline = Pipeline.create(); + PCollection input = + pipeline.apply( + Create.of("true,\"1.1,3.141592,1,5,foo", "true,1.1,3.141592,this_is_an_error,5,foo")); + Schema schema = + Schema.builder() + .addBooleanField("aBoolean") + .addDoubleField("aDouble") + .addFloatField("aFloat") + .addInt32Field("anInteger") + .addInt64Field("aLong") + .addStringField("aString") + .build(); + CsvIOParse underTest = CsvIO.parseRows(schema, csvFormat().withQuote('"')); + CsvIOParseResult result = input.apply(underTest); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(2L); + PAssert.thatSingleton( + stackTraceContains(result.getErrors(), CsvIOStringToCsvRecord.class.getName())) + .isEqualTo(1L); + PAssert.thatSingleton( + 
stackTraceContains(result.getErrors(), CsvIORecordToObjects.class.getName())) + .isEqualTo(1L); + + pipeline.run(); + } + + private static PCollection stackTraceContains( + PCollection errors, String match) { + return errors + .apply(match, Filter.by(input -> checkStateNotNull(input).getStackTrace().contains(match))) + .apply(match, Count.globally()); + } + + private static CSVFormat csvFormat() { + return CSVFormat.DEFAULT + .withAllowDuplicateHeaderNames(false) + .withHeader(HEADER) + .withCommentMarker('#') + .withNullString("N/A") + .withEscape('$'); + } + + private static PCollection csvRecords(Pipeline pipeline, String... lines) { + return pipeline.apply( + Create.of(Arrays.asList(lines)).withCoder(NullableCoder.of(StringUtf8Coder.of()))); + } + + private static class NonSchemaMappedPojo implements Serializable { + private final String aString; + private final Integer anInteger; + private final Double aDouble; + + private NonSchemaMappedPojo(String aString, Integer anInteger, Double aDouble) { + this.aString = aString; + this.anInteger = anInteger; + this.aDouble = aDouble; + } + + public String getAString() { + return aString; + } + + public Integer getAnInteger() { + return anInteger; + } + + public Double getADouble() { + return aDouble; + } + } +} From b795a61f09432f6ead573f15aa99436865565ec8 Mon Sep 17 00:00:00 2001 From: Andrew Crites Date: Thu, 1 Aug 2024 16:01:35 -0700 Subject: [PATCH 02/78] Adds null checks when accessing OperationalLimits in config since they might not have been set yet. 
(#32053) --- .../config/StreamingEngineComputationConfigFetcher.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java index 850e8c3f24bdc..d230aac54c636 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/config/StreamingEngineComputationConfigFetcher.java @@ -188,11 +188,13 @@ private StreamingEnginePipelineConfig createPipelineConfig(StreamingConfigTask c } if (config.getOperationalLimits() != null) { - if (config.getOperationalLimits().getMaxKeyBytes() > 0 + if (config.getOperationalLimits().getMaxKeyBytes() != null + && config.getOperationalLimits().getMaxKeyBytes() > 0 && config.getOperationalLimits().getMaxKeyBytes() <= Integer.MAX_VALUE) { pipelineConfig.setMaxOutputKeyBytes(config.getOperationalLimits().getMaxKeyBytes()); } - if (config.getOperationalLimits().getMaxProductionOutputBytes() > 0 + if (config.getOperationalLimits().getMaxProductionOutputBytes() != null + && config.getOperationalLimits().getMaxProductionOutputBytes() > 0 && config.getOperationalLimits().getMaxProductionOutputBytes() <= Integer.MAX_VALUE) { pipelineConfig.setMaxOutputValueBytes( config.getOperationalLimits().getMaxProductionOutputBytes()); From 202fa56be771167495cbcc3fd84e60fcc77147c9 Mon Sep 17 00:00:00 2001 From: Damon Date: Thu, 1 Aug 2024 18:00:17 -0700 Subject: [PATCH 03/78] Enable ExternalWorkerService during Prism Runner lifecycle (#32057) --- runners/prism/java/build.gradle | 2 + 
.../beam/runners/prism/WorkerService.java | 116 ++++++++++++++++++ .../beam/runners/prism/WorkerServiceTest.java | 85 +++++++++++++ 3 files changed, 203 insertions(+) create mode 100644 runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java create mode 100644 runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index 23f4a024569b4..2b0635ca61255 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -27,7 +27,9 @@ ext.summary = "Support for executing a pipeline on Prism." dependencies { implementation project(path: ":model:job-management", configuration: "shadow") + implementation project(path: ":model:pipeline", configuration: "shadow") implementation project(path: ":sdks:java:core", configuration: "shadow") + implementation project(path: ":sdks:java:harness", configuration: "shadow") implementation project(":runners:portability:java") implementation library.java.joda_time diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java new file mode 100644 index 0000000000000..289ffac64f8a4 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/WorkerService.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; + +import org.apache.beam.fn.harness.ExternalWorkerService; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.fn.server.GrpcFnServer; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An {@link ExternalWorkerService} {@link GrpcFnServer} encapsulation that {@link #stop}s when + * {@link StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is + * {@link PipelineResult.State#isTerminal}. + */ +class WorkerService implements StateListener { + + private static final Logger LOG = LoggerFactory.getLogger(WorkerService.class); + + private final ExternalWorkerService worker; + private @MonotonicNonNull GrpcFnServer server; + + WorkerService(PortablePipelineOptions options) { + this.worker = new ExternalWorkerService(options); + } + + /** Start the {@link ExternalWorkerService}. */ + void start() throws Exception { + if (server != null && !server.getServer().isShutdown()) { + return; + } + + server = worker.start(); + LOG.info("Starting worker service at {}", getApiServiceDescriptorUrl()); + } + + /** + * Queries whether the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server} is + * running. 
+ */ + boolean isRunning() { + if (server == null) { + return false; + } + return !server.getServer().isShutdown(); + } + + /** + * Queries the {@link Endpoints.ApiServiceDescriptor#getUrl} of the {@link ExternalWorkerService} + * {@link GrpcFnServer}'s {@link Server}. Throws an exception if the {@link WorkerService} has not + * {@link WorkerService#start}ed. + */ + String getApiServiceDescriptorUrl() { + return checkStateNotNull(server, "worker service not started") + .getApiServiceDescriptor() + .getUrl(); + } + + /** + * Updates {@link PortablePipelineOptions#getDefaultEnvironmentConfig} with {@link + * #getApiServiceDescriptorUrl}. Throws an exception if the {@link WorkerService} has not {@link + * WorkerService#start}ed. + */ + PortablePipelineOptions updateDefaultEnvironmentConfig(PortablePipelineOptions options) { + options.setDefaultEnvironmentConfig(getApiServiceDescriptorUrl()); + return options; + } + + /** + * Overrides {@link StateListener#onStateChanged}, invoking {@link #stop} when {@link + * PipelineResult.State#isTerminal}. + */ + @Override + public void onStateChanged(PipelineResult.State state) { + if (state.isTerminal()) { + stop(); + } + } + + /** + * Stops the {@link ExternalWorkerService} {@link GrpcFnServer}'s {@link Server}. If not {@link + * WorkerService#isRunning()}, then calling stop is a noop. 
+ */ + void stop() { + if (server == null || server.getServer().isShutdown()) { + return; + } + LOG.info("Stopping worker service at {}", getApiServiceDescriptorUrl()); + try { + server.close(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java new file mode 100644 index 0000000000000..7fc05d7747cd6 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/WorkerServiceTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link WorkerService}. 
*/ +@RunWith(JUnit4.class) +public class WorkerServiceTest { + @Test + public void testStartStop() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + underTest.start(); + assertThat(underTest.isRunning()).isTrue(); + assertThat(underTest.getApiServiceDescriptorUrl()).matches("localhost:\\d+"); + underTest.stop(); + assertThat(underTest.isRunning()).isFalse(); + } + + @Test + public void givenStarted_updateDefaultEnvironmentConfig() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + assertThat(options.getDefaultEnvironmentConfig()).isNull(); + WorkerService underTest = new WorkerService(options); + underTest.start(); + options = underTest.updateDefaultEnvironmentConfig(options); + assertThat(options.getDefaultEnvironmentConfig()) + .isEqualTo(underTest.getApiServiceDescriptorUrl()); + underTest.stop(); + } + + @Test + public void givenNotStarted_updateDefaultEnvironmentConfig_throws() { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + assertThrows( + IllegalStateException.class, () -> underTest.updateDefaultEnvironmentConfig(options)); + } + + @Test + public void whenStateIsTerminal_thenStop() throws Exception { + PortablePipelineOptions options = + PipelineOptionsFactory.create().as(PortablePipelineOptions.class); + WorkerService underTest = new WorkerService(options); + assertThat(underTest.isRunning()).isFalse(); + underTest.start(); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isRunning()).isTrue(); + + underTest.onStateChanged(PipelineResult.State.CANCELLED); + 
assertThat(underTest.isRunning()).isFalse(); + } +} From 0b4b8ea9423dce255b38f2e2533307b1930fbd13 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Fri, 2 Aug 2024 09:52:16 -0400 Subject: [PATCH 04/78] Handle rc container in _update_container_image_for_dataflow (#32049) --- .../runners/dataflow/internal/apiclient.py | 6 +++ .../dataflow/internal/apiclient_test.py | 37 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py index badc3683bb285..20cae582f320d 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient.py @@ -733,6 +733,12 @@ def _update_container_image_for_dataflow(beam_container_image_url): # By default Dataflow pipelines use containers hosted in Dataflow GCR # instead of Docker Hub. image_suffix = beam_container_image_url.rsplit('/', 1)[1] + + # trim "RCX" as release candidate tag exists on Docker Hub but not GCR + check_rc = image_suffix.lower().split('rc') + if len(check_rc) == 2: + image_suffix = image_suffix[:-2 - len(check_rc[1])] + return names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY + '/' + image_suffix @staticmethod diff --git a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py index a4e1a5253a736..8331d9cf3919a 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/apiclient_test.py @@ -321,6 +321,43 @@ def test_dataflow_container_image_override_prime(self): self._verify_dataflow_container_image_override(pipeline_options) + def _verify_dataflow_container_image_override_rc(self, pipeline_options): + pipeline = Pipeline(options=pipeline_options) + pipeline | Create([1, 2, 3]) | ParDo(DoFn()) # pylint:disable=expression-not-assigned + + dummy_env = DockerEnvironment( + 
container_image='apache/beam_dummy_name:2.00.0RC10') + proto_pipeline, _ = pipeline.to_runner_api( + return_context=True, default_environment=dummy_env) + + # Accessing non-public method for testing. + apiclient.DataflowApplicationClient._apply_sdk_environment_overrides( + proto_pipeline, {}, pipeline_options) + + from apache_beam.utils import proto_utils + found_override = False + trimed_rc = True + for env in proto_pipeline.components.environments.values(): + docker_payload = proto_utils.parse_Bytes( + env.payload, beam_runner_api_pb2.DockerPayload) + if docker_payload.container_image.startswith( + names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY): + found_override = True + if docker_payload.container_image.split(':')[-1] != '2.00.0': + trimed_rc = False + + self.assertTrue(found_override) + self.assertTrue(trimed_rc) + + def test_dataflow_container_image_override_rc(self): + pipeline_options = PipelineOptions([ + '--experiments=use_runner_v2', + '--temp_location', + 'gs://any-location/temp' + ]) + + self._verify_dataflow_container_image_override_rc(pipeline_options) + def _verify_non_apache_container_not_overridden(self, pipeline_options): pipeline = Pipeline(options=pipeline_options) pipeline | Create([1, 2, 3]) | ParDo(DoFn()) # pylint:disable=expression-not-assigned From d96fa7d4009a9638e0cdfc8b107710eaf8362f00 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 2 Aug 2024 17:04:40 +0200 Subject: [PATCH 05/78] Add some large model troubleshooting steps (#31862) --- .../ml/large-language-modeling.md | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/website/www/site/content/en/documentation/ml/large-language-modeling.md b/website/www/site/content/en/documentation/ml/large-language-modeling.md index 79ef58e6de319..90bbd43383c06 100644 --- a/website/www/site/content/en/documentation/ml/large-language-modeling.md +++ b/website/www/site/content/en/documentation/ml/large-language-modeling.md @@ -27,7 +27,7 @@ RunInference has 
several mechanisms for reducing memory utilization. For example Many Beam runners, however, run multiple Beam processes per machine at once. This can cause problems since the memory footprint of loading large models like LLMs multiple times can be too large to fit into a single machine. For memory-intensive models, RunInference provides a mechanism for more intelligently sharing memory across multiple processes to reduce the overall memory footprint. To enable this mode, users just have -to set the parameter `large_model` to True in their model configuration (see below for an example), and Beam will take care of the memory management. +to set the parameter `large_model` to True in their model configuration (see below for an example), and Beam will take care of the memory management. When using a custom model handler, you can override the `share_model_across_processes` function or the `model_copies` function for a similar effect. ### Running an Example Pipeline with T5 @@ -122,3 +122,51 @@ A `ModelHandler` requires parameters like: * `device` – The device on which you wish to run the model. If device = GPU then a GPU device will be used if it is available. Otherwise, it will be CPU. * `inference_fn` - The inference function to use during RunInference. * `large_model` - (see `Memory Management` above). Whether to use memory minimization techniques to lower the memory footprint of your model. + +### Troubleshooting Large Models + +#### Pickling errors + +When sharing a model across processes with `large_model=True` or using a custom model handler, Beam sends the input and output data across a process boundary. +To do this, it uses a serialization method known as [pickling](https://docs.python.org/3/library/pickle.html). +For example, if you call `output=model.my_inference_fn(input_1, input_2)`, `input_1`, `input_2`, and `output` will all need to be pickled. +The model itself does not need to be pickled since it is not passed across process boundaries. 
+ +While most objects can be pickled without issue, if one of these objects is unpickleable you may run into errors like `error: can't pickle fasttext_pybind.fasttext objects`. +To work around this, there are a few options: + +First of all, if possible you can choose not to share your model across processes. This will incur additional memory pressure, but it may be tolerable in some cases. + +Second, using a custom model handler you can wrap your model to take in and return serializable types. For example, if your model handler looks like: + +``` +class MyModelHandler(): + def load_model(self): + return model_loading_logic() + + def run_inference(self, batch: Sequence[str], model, inference_args): + unpickleable_object = Unpickleable(batch) + unpickleable_returned = model.predict(unpickleable_object) + my_output = int(unpickleable_returned[0]) + return my_output +``` + +you could instead wrap the unpickleable pieces in a model wrapper. Since the model wrapper will sit in the inference process, this will work as long as it only takes in/returns pickleable objects. + +``` +class MyWrapper(): + def __init__(self, model): + self._model = model + + def predict(self, batch: Sequence[str]): + unpickleable_object = Unpickleable(batch) + unpickleable_returned = self._model.predict(unpickleable_object) + return int(unpickleable_returned[0]) + +class MyModelHandler(): + def load_model(self): + return MyWrapper(model_loading_logic()) + + def run_inference(self, batch: Sequence[str], model: MyWrapper, inference_args): + return model.predict(batch) +``` From bf42a8153af582e4dd97140bebf1a829f35dfe20 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Fri, 2 Aug 2024 14:12:26 -0700 Subject: [PATCH 06/78] [#32064] Keep elements heap in sequence order.
(#32065) Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- .../prism/internal/engine/elementmanager.go | 15 ++++++++++++++- .../beam/runners/prism/internal/engine/timers.go | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index a632318e02c7e..bc8449c72b39a 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -45,6 +45,14 @@ type element struct { holdTimestamp mtime.Time // only used for Timers pane typex.PaneInfo transform, family, tag string // only used for Timers. + // Used to ensure ordering within a key when sorting the heap, + // which isn't using a stable sort. + // Since ordering is weak across multiple bundles, it needs only + // be consistent between exiting a stage and entering a stateful stage. + // No synchronization is required in specifying this, + // since keyed elements are only processed by a single bundle at a time, + // if stateful stages are concerned. + sequence int elmBytes []byte // When nil, indicates this is a timer. keyBytes []byte @@ -103,7 +111,8 @@ func (h elementHeap) Less(i, j int) bool { } else if h[i].IsData() && h[j].IsTimer() { return true // i before j. } - // They're the same kind, fall through to timestamp less for consistency. + // They're the same kind, so compare by the sequence value. + return h[i].sequence < h[j].sequence } // Otherwise compare by timestamp. 
return h[i].timestamp < h[j].timestamp @@ -688,6 +697,7 @@ func reElementResiduals(residuals []Residual, inputInfo PColInfo, rb RunBundle) pane: pn, elmBytes: elmBytes, keyBytes: keyBytes, + sequence: len(unprocessedElements), }) } } @@ -704,6 +714,7 @@ func reElementResiduals(residuals []Residual, inputInfo PColInfo, rb RunBundle) // PersistBundle takes in the stage ID, ID of the bundle associated with the pending // input elements, and the committed output elements. func (em *ElementManager) PersistBundle(rb RunBundle, col2Coders map[string]PColInfo, d TentativeData, inputInfo PColInfo, residuals Residuals) { + var seq int for output, data := range d.Raw { info := col2Coders[output] var newPending []element @@ -743,7 +754,9 @@ func (em *ElementManager) PersistBundle(rb RunBundle, col2Coders map[string]PCol pane: pn, elmBytes: elmBytes, keyBytes: keyBytes, + sequence: seq, }) + seq++ } } } diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go index 3f52ebc4510cf..787d27858a0e5 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go @@ -74,6 +74,7 @@ func decodeTimer(keyDec func(io.Reader) []byte, usesGlobalWindow bool, raw []byt timestamp: firing, holdTimestamp: hold, pane: pane, + sequence: len(ret), }) } return keyBytes, tag, ret From 7e7508731521862097c6e6230ac4bb6c53477965 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Sat, 3 Aug 2024 22:06:12 -0700 Subject: [PATCH 07/78] Update top_wikipedia_sessions to be more idiomatic with beam.Map. (#32041) * Update top_wikipedia_sessions to be more idiomatic with beam.Map. 
* lint --- .../complete/top_wikipedia_sessions.py | 34 ++++++++----------- .../complete/top_wikipedia_sessions_test.py | 2 ++ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py index 7064a5add13c7..50b026edf2402 100644 --- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py +++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions.py @@ -61,14 +61,13 @@ MAX_TIMESTAMP = 0x7fffffffffffffff -class ExtractUserAndTimestampDoFn(beam.DoFn): +def extract_user_and_timestamp(element): """Extracts user and timestamp representing a Wikipedia edit.""" - def process(self, element): - table_row = json.loads(element) - if 'contributor_username' in table_row: - user_name = table_row['contributor_username'] - timestamp = table_row['timestamp'] - yield TimestampedValue(user_name, timestamp) + table_row = json.loads(element) + if 'contributor_username' in table_row: + user_name = table_row['contributor_username'] + timestamp = table_row['timestamp'] + return TimestampedValue(user_name, timestamp) class ComputeSessions(beam.PTransform): @@ -98,19 +97,15 @@ def expand(self, pcoll): without_defaults()) -class SessionsToStringsDoFn(beam.DoFn): +def sessions_to_strings(element, window=beam.DoFn.WindowParam): """Adds the session information to be part of the key.""" - def process(self, element, window=beam.DoFn.WindowParam): - yield (element[0] + ' : ' + str(window), element[1]) + return (element[0] + ' : ' + str(window), element[1]) -class FormatOutputDoFn(beam.DoFn): +def format_output(element, window=beam.DoFn.WindowParam): """Formats a string containing the user, count, and session.""" - def process(self, element, window=beam.DoFn.WindowParam): - for kv in element: - session = kv[0] - count = kv[1] - yield session + ' : ' + str(count) + ' : ' + str(window) + for session, count in element: + yield session + ' 
: ' + str(count) + ' : ' + str(window) class ComputeTopSessions(beam.PTransform): @@ -124,14 +119,13 @@ def __init__(self, sampling_threshold): def expand(self, pcoll): return ( pcoll - | - 'ExtractUserAndTimestamp' >> beam.ParDo(ExtractUserAndTimestampDoFn()) + | 'ExtractUserAndTimestamp' >> beam.Map(extract_user_and_timestamp) | beam.Filter( lambda x: (abs(hash(x)) <= MAX_TIMESTAMP * self.sampling_threshold)) | ComputeSessions() - | 'SessionsToStrings' >> beam.ParDo(SessionsToStringsDoFn()) + | 'SessionsToStrings' >> beam.Map(sessions_to_strings) | TopPerMonth() - | 'FormatOutput' >> beam.ParDo(FormatOutputDoFn())) + | 'FormatOutput' >> beam.FlatMap(format_output)) def run(argv=None): diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py index 3c171664e45d2..92d1d196fe055 100644 --- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py +++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_test.py @@ -28,6 +28,8 @@ from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +# TODO: Unit test top_wikipedia_sessions.extract_user_and_timestamp, etc. 
+ class ComputeTopSessionsTest(unittest.TestCase): From ca744ae9f6511c58ee08c391bc06e63c0f90ecce Mon Sep 17 00:00:00 2001 From: martin trieu Date: Mon, 5 Aug 2024 16:04:28 +0700 Subject: [PATCH 08/78] Add WorkProvider interfaces and implementations (#31883) --- .../worker/StreamingDataflowWorker.java | 342 +++++++----------- .../FanOutStreamingEngineWorkerHarness.java} | 63 ++-- .../harness/SingleSourceWorkerHarness.java | 284 +++++++++++++++ .../StreamingEngineConnectionState.java | 2 +- .../harness/StreamingWorkerHarness.java | 28 ++ .../harness}/WindmillStreamSender.java | 20 +- .../windmill/client/grpc/ChannelzServlet.java | 27 +- .../grpc/GetWorkResponseChunkAssembler.java | 139 +++++++ .../client/grpc/GrpcDirectGetWorkStream.java | 148 +++----- .../client/grpc/GrpcDispatcherClient.java | 4 +- .../client/grpc/GrpcGetWorkStream.java | 131 ++----- .../windmill/work/WorkItemScheduler.java | 4 - .../budget/EvenGetWorkBudgetDistributor.java | 27 +- .../work/budget/GetWorkBudgetDistributor.java | 5 +- .../work/budget/GetWorkBudgetSpender.java | 32 ++ .../worker/StreamingDataflowWorkerTest.java | 2 +- ...nOutStreamingEngineWorkerHarnessTest.java} | 40 +- .../harness}/WindmillStreamSenderTest.java | 5 +- .../client/grpc/ChannelzServletTest.java | 6 +- .../client/grpc/GrpcWindmillServerTest.java | 12 +- .../EvenGetWorkBudgetDistributorTest.java | 126 +++---- 21 files changed, 839 insertions(+), 608 deletions(-) rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/{windmill/client/grpc/StreamingEngineClient.java => streaming/harness/FanOutStreamingEngineWorkerHarness.java} (91%) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/{windmill/client/grpc => streaming/harness}/StreamingEngineConnectionState.java 
(97%) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java rename runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/{windmill/client/grpc => streaming/harness}/WindmillStreamSender.java (93%) create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java create mode 100644 runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java rename runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/{windmill/client/grpc/StreamingEngineClientTest.java => streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java} (93%) rename runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/{windmill/client/grpc => streaming/harness}/WindmillStreamSenderTest.java (97%) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java index f196852b22532..90f072be997ed 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorker.java @@ -23,14 +23,13 @@ import com.google.api.services.dataflow.model.CounterUpdate; import com.google.api.services.dataflow.model.MapTask; import com.google.auto.value.AutoValue; -import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -48,26 +47,25 @@ import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; import org.apache.beam.runners.dataflow.worker.streaming.ComputationStateCache; import org.apache.beam.runners.dataflow.worker.streaming.StageInfo; -import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; -import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.streaming.WorkHeartbeatResponseProcessor; import org.apache.beam.runners.dataflow.worker.streaming.config.ComputationConfig; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingApplianceComputationConfigFetcher; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEngineComputationConfigFetcher; import org.apache.beam.runners.dataflow.worker.streaming.config.StreamingEnginePipelineConfig; +import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness; +import org.apache.beam.runners.dataflow.worker.streaming.harness.SingleSourceWorkerHarness.GetWorkSender; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingCounters; +import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerHarness; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerStatusPages; import org.apache.beam.runners.dataflow.worker.streaming.harness.StreamingWorkerStatusReporter; import org.apache.beam.runners.dataflow.worker.util.BoundedQueueExecutor; import org.apache.beam.runners.dataflow.worker.util.MemoryMonitor; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; -import 
org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.appliance.JniWindmillApplianceServer; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetDataStream; -import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStreamPool; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.CompleteCommit; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.StreamingApplianceWorkCommitter; @@ -104,12 +102,11 @@ import org.apache.beam.sdk.util.construction.CoderTranslation; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheStats; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.net.HostAndPort; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles; import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; @@ -119,7 +116,8 @@ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) -public class StreamingDataflowWorker { +public final class StreamingDataflowWorker { + /** * Sinks are marked 'full' in {@link 
StreamingModeExecutionContext} once the amount of data sinked * (across all the sinks, if there are more than one) reaches this limit. This serves as hint for @@ -128,47 +126,44 @@ public class StreamingDataflowWorker { */ public static final int MAX_SINK_BYTES = 10_000_000; - // Maximum number of threads for processing. Currently, each thread processes one key at a time. - static final int MAX_PROCESSING_THREADS = 300; - static final long THREAD_EXPIRATION_TIME_SEC = 60; - static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; - static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); private static final Logger LOG = LoggerFactory.getLogger(StreamingDataflowWorker.class); - private static final Duration GET_DATA_STREAM_TIMEOUT = Duration.standardSeconds(30); + + /** + * Maximum number of threads for processing. Currently, each thread processes one key at a time. + */ + private static final int MAX_PROCESSING_THREADS = 300; /** The idGenerator to generate unique id globally. */ private static final IdGenerator ID_GENERATOR = IdGenerators.decrementingLongs(); - private static final int DEFAULT_STATUS_PORT = 8081; - // Maximum size of the result of a GetWork request. + /** Maximum size of the result of a GetWork request. */ private static final long MAX_GET_WORK_FETCH_BYTES = 64L << 20; // 64m /** Maximum number of failure stacktraces to report in each update sent to backend. 
*/ private static final int MAX_FAILURES_TO_REPORT_IN_UPDATE = 1000; - private static final Random clientIdGenerator = new Random(); + private static final long THREAD_EXPIRATION_TIME_SEC = 60; + private static final Duration COMMIT_STREAM_TIMEOUT = Duration.standardMinutes(1); + private static final Duration GET_DATA_STREAM_TIMEOUT = Duration.standardSeconds(30); + private static final int DEFAULT_STATUS_PORT = 8081; + private static final Random CLIENT_ID_GENERATOR = new Random(); private static final String CHANNELZ_PATH = "/channelz"; - final WindmillStateCache stateCache; + + private final WindmillStateCache stateCache; private final StreamingWorkerStatusPages statusPages; private final ComputationConfig.Fetcher configFetcher; private final ComputationStateCache computationStateCache; private final BoundedQueueExecutor workUnitExecutor; - private final WindmillServerStub windmillServer; - private final Thread dispatchThread; + private final StreamingWorkerHarness streamingWorkerHarness; private final AtomicBoolean running = new AtomicBoolean(); private final DataflowWorkerHarnessOptions options; - private final long clientId; - private final GetDataClient getDataClient; - private final MemoryMonitor memoryMonitor; - private final Thread memoryMonitorThread; + private final BackgroundMemoryMonitor memoryMonitor; private final ReaderCache readerCache; private final DataflowExecutionStateSampler sampler = DataflowExecutionStateSampler.instance(); private final ActiveWorkRefresher activeWorkRefresher; private final WorkCommitter workCommitter; private final StreamingWorkerStatusReporter workerStatusReporter; private final StreamingCounters streamingCounters; - private final StreamingWorkScheduler streamingWorkScheduler; - private final HeartbeatSender heartbeatSender; private StreamingDataflowWorker( WindmillServerStub windmillServer, @@ -226,39 +221,42 @@ private StreamingDataflowWorker( this.workUnitExecutor = workUnitExecutor; - memoryMonitorThread = new 
Thread(memoryMonitor); - memoryMonitorThread.setPriority(Thread.MIN_PRIORITY); - memoryMonitorThread.setName("MemoryMonitor"); - - dispatchThread = - new Thread( - () -> { - LOG.info("Dispatch starting"); - if (windmillServiceEnabled) { - streamingDispatchLoop(); - } else { - dispatchLoop(); - } - LOG.info("Dispatch done"); - }); - dispatchThread.setDaemon(true); - dispatchThread.setPriority(Thread.MIN_PRIORITY); - dispatchThread.setName("DispatchThread"); - this.clientId = clientId; - this.windmillServer = windmillServer; + this.workerStatusReporter = workerStatusReporter; + this.streamingCounters = streamingCounters; + this.memoryMonitor = BackgroundMemoryMonitor.create(memoryMonitor); + StreamingWorkScheduler streamingWorkScheduler = + StreamingWorkScheduler.create( + options, + clock, + readerCache, + mapTaskExecutorFactory, + workUnitExecutor, + stateCache::forComputation, + failureTracker, + workFailureProcessor, + streamingCounters, + hotKeyLogger, + sampler, + operationalLimits, + ID_GENERATOR, + stageInfoMap); ThrottlingGetDataMetricTracker getDataMetricTracker = new ThrottlingGetDataMetricTracker(memoryMonitor); - + WorkerStatusPages workerStatusPages = + WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); + StreamingWorkerStatusPages.Builder statusPagesBuilder = StreamingWorkerStatusPages.builder(); int stuckCommitDurationMillis; + GetDataClient getDataClient; + HeartbeatSender heartbeatSender; if (windmillServiceEnabled) { WindmillStreamPool getDataStreamPool = WindmillStreamPool.create( Math.max(1, options.getWindmillGetDataStreamCount()), GET_DATA_STREAM_TIMEOUT, windmillServer::getDataStream); - this.getDataClient = new StreamPoolGetDataClient(getDataMetricTracker, getDataStreamPool); - this.heartbeatSender = + getDataClient = new StreamPoolGetDataClient(getDataMetricTracker, getDataStreamPool); + heartbeatSender = new StreamPoolHeartbeatSender( options.getUseSeparateWindmillHeartbeatStreams() ? 
WindmillStreamPool.create( @@ -266,9 +264,16 @@ private StreamingDataflowWorker( : getDataStreamPool); stuckCommitDurationMillis = options.getStuckCommitDurationMillis() > 0 ? options.getStuckCommitDurationMillis() : 0; + statusPagesBuilder + .setDebugCapture( + new DebugCapture.Manager(options, workerStatusPages.getDebugCapturePages())) + .setChannelzServlet( + new ChannelzServlet( + CHANNELZ_PATH, options, windmillServer::getWindmillServiceEndpoints)) + .setWindmillStreamFactory(windmillStreamFactory); } else { - this.getDataClient = new ApplianceGetDataClient(windmillServer, getDataMetricTracker); - this.heartbeatSender = new ApplianceHeartbeatSender(windmillServer::getData); + getDataClient = new ApplianceGetDataClient(windmillServer, getDataMetricTracker); + heartbeatSender = new ApplianceHeartbeatSender(windmillServer::getData); stuckCommitDurationMillis = 0; } @@ -282,49 +287,40 @@ private StreamingDataflowWorker( executorSupplier.apply("RefreshWork"), getDataMetricTracker::trackHeartbeats); - WorkerStatusPages workerStatusPages = - WorkerStatusPages.create(DEFAULT_STATUS_PORT, memoryMonitor); - StreamingWorkerStatusPages.Builder statusPagesBuilder = - StreamingWorkerStatusPages.builder() + this.statusPages = + statusPagesBuilder .setClock(clock) .setClientId(clientId) .setIsRunning(running) .setStatusPages(workerStatusPages) .setStateCache(stateCache) - .setComputationStateCache(computationStateCache) + .setComputationStateCache(this.computationStateCache) .setCurrentActiveCommitBytes(workCommitter::currentActiveCommitBytes) .setGetDataStatusProvider(getDataClient::printHtml) - .setWorkUnitExecutor(workUnitExecutor); + .setWorkUnitExecutor(workUnitExecutor) + .build(); - this.statusPages = - windmillServiceEnabled - ? 
statusPagesBuilder - .setDebugCapture( - new DebugCapture.Manager(options, workerStatusPages.getDebugCapturePages())) - .setChannelzServlet(new ChannelzServlet(CHANNELZ_PATH, options, windmillServer)) - .setWindmillStreamFactory(windmillStreamFactory) - .build() - : statusPagesBuilder.build(); + Windmill.GetWorkRequest request = + Windmill.GetWorkRequest.newBuilder() + .setClientId(clientId) + .setMaxItems(chooseMaximumBundlesOutstanding()) + .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) + .build(); - this.workerStatusReporter = workerStatusReporter; - this.streamingCounters = streamingCounters; - this.memoryMonitor = memoryMonitor; - this.streamingWorkScheduler = - StreamingWorkScheduler.create( - options, - clock, - readerCache, - mapTaskExecutorFactory, - workUnitExecutor, - stateCache::forComputation, - failureTracker, - workFailureProcessor, - streamingCounters, - hotKeyLogger, - sampler, - operationalLimits, - ID_GENERATOR, - stageInfoMap); + this.streamingWorkerHarness = + SingleSourceWorkerHarness.builder() + .setStreamingWorkScheduler(streamingWorkScheduler) + .setWorkCommitter(workCommitter) + .setGetDataClient(getDataClient) + .setComputationStateFetcher(this.computationStateCache::get) + .setWaitForResources(() -> memoryMonitor.waitForResources("GetWork")) + .setHeartbeatSender(heartbeatSender) + .setGetWorkSender( + windmillServiceEnabled + ? 
GetWorkSender.forStreamingEngine( + receiver -> windmillServer.getWorkStream(request, receiver)) + : GetWorkSender.forAppliance(() -> windmillServer.getWork(request))) + .build(); LOG.debug("windmillServiceEnabled: {}", windmillServiceEnabled); LOG.debug("WindmillServiceEndpoint: {}", options.getWindmillServiceEndpoint()); @@ -333,7 +329,7 @@ private StreamingDataflowWorker( } public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions options) { - long clientId = clientIdGenerator.nextLong(); + long clientId = CLIENT_ID_GENERATOR.nextLong(); MemoryMonitor memoryMonitor = MemoryMonitor.fromOptions(options); ConcurrentMap stageInfo = new ConcurrentHashMap<>(); StreamingCounters streamingCounters = StreamingCounters.create(); @@ -438,9 +434,10 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o ComputationConfig.Fetcher configFetcher; WindmillServerStub windmillServer; ComputationStateCache computationStateCache; - GrpcDispatcherClient dispatcherClient = GrpcDispatcherClient.create(createStubFactory(options)); GrpcWindmillStreamFactory windmillStreamFactory; if (options.isEnableStreamingEngine()) { + GrpcDispatcherClient dispatcherClient = + GrpcDispatcherClient.create(createStubFactory(options)); configFetcher = StreamingEngineComputationConfigFetcher.create( options.getGlobalConfigRefreshPeriod().getMillis(), @@ -469,7 +466,10 @@ public static StreamingDataflowWorker fromOptions(DataflowWorkerHarnessOptions o options.getWindmillServiceStreamingRpcHealthCheckPeriodMs()) .build(); windmillServer = - GrpcWindmillServer.create(options, windmillStreamFactory, dispatcherClient); + GrpcWindmillServer.create( + options, + windmillStreamFactory, + GrpcDispatcherClient.create(createStubFactory(options))); } else { windmillStreamFactory = windmillStreamFactoryBuilder.build(); windmillServer = new JniWindmillApplianceServer(options.getLocalWindmillHostport()); @@ -704,10 +704,6 @@ private static ChannelCachingStubFactory 
createStubFactory( return ChannelCachingRemoteStubFactory.create(workerOptions.getGcpCredential(), channelCache); } - private static void sleep(int millis) { - Uninterruptibles.sleepUninterruptibly(millis, TimeUnit.MILLISECONDS); - } - private static int chooseMaxThreads(DataflowWorkerHarnessOptions options) { if (options.getNumberOfWorkerHarnessThreads() != 0) { return options.getNumberOfWorkerHarnessThreads(); @@ -736,7 +732,7 @@ private static void enableBigQueryMetrics() { } @VisibleForTesting - final void reportPeriodicWorkerUpdatesForTest() { + void reportPeriodicWorkerUpdatesForTest() { workerStatusReporter.reportPeriodicWorkerUpdates(); } @@ -765,6 +761,11 @@ int numCommitThreads() { return workCommitter.parallelism(); } + @VisibleForTesting + CacheStats getStateCacheStats() { + return stateCache.getCacheStats(); + } + @VisibleForTesting ComputationStateCache getComputationStateCache() { return computationStateCache; @@ -773,14 +774,10 @@ ComputationStateCache getComputationStateCache() { @SuppressWarnings("FutureReturnValueIgnored") public void start() { running.set(true); - configFetcher.start(); - - memoryMonitorThread.start(); - dispatchThread.start(); + memoryMonitor.start(); + streamingWorkerHarness.start(); sampler.start(); - - workCommitter.start(); workerStatusReporter.start(); activeWorkRefresher.start(); } @@ -794,121 +791,19 @@ private void startStatusPages() { void stop() { try { configFetcher.stop(); - activeWorkRefresher.stop(); statusPages.stop(); running.set(false); - dispatchThread.interrupt(); - dispatchThread.join(); - - workCommitter.stop(); - memoryMonitor.stop(); - memoryMonitorThread.join(); + streamingWorkerHarness.shutdown(); + memoryMonitor.shutdown(); workUnitExecutor.shutdown(); - computationStateCache.closeAndInvalidateAll(); - workerStatusReporter.stop(); } catch (Exception e) { LOG.warn("Exception while shutting down: ", e); } } - private void dispatchLoop() { - while (running.get()) { - 
memoryMonitor.waitForResources("GetWork"); - - int backoff = 1; - Windmill.GetWorkResponse workResponse = null; - do { - try { - workResponse = getWork(); - if (workResponse.getWorkCount() > 0) { - break; - } - } catch (WindmillServerStub.RpcException e) { - LOG.warn("GetWork failed, retrying:", e); - } - sleep(backoff); - backoff = Math.min(1000, backoff * 2); - } while (running.get()); - for (final Windmill.ComputationWorkItems computationWork : workResponse.getWorkList()) { - final String computationId = computationWork.getComputationId(); - Optional maybeComputationState = computationStateCache.get(computationId); - if (!maybeComputationState.isPresent()) { - continue; - } - - final ComputationState computationState = maybeComputationState.get(); - final Instant inputDataWatermark = - WindmillTimeUtils.windmillToHarnessWatermark(computationWork.getInputDataWatermark()); - Watermarks.Builder watermarks = - Watermarks.builder() - .setInputDataWatermark(Preconditions.checkNotNull(inputDataWatermark)) - .setSynchronizedProcessingTime( - WindmillTimeUtils.windmillToHarnessWatermark( - computationWork.getDependentRealtimeInputWatermark())); - - for (final Windmill.WorkItem workItem : computationWork.getWorkList()) { - streamingWorkScheduler.scheduleWork( - computationState, - workItem, - watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), - Work.createProcessingContext( - computationId, getDataClient, workCommitter::commit, heartbeatSender), - /* getWorkStreamLatencies= */ Collections.emptyList()); - } - } - } - } - - void streamingDispatchLoop() { - while (running.get()) { - GetWorkStream stream = - windmillServer.getWorkStream( - Windmill.GetWorkRequest.newBuilder() - .setClientId(clientId) - .setMaxItems(chooseMaximumBundlesOutstanding()) - .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) - .build(), - (String computation, - Instant inputDataWatermark, - Instant synchronizedProcessingTime, - Windmill.WorkItem workItem, - Collection 
getWorkStreamLatencies) -> - computationStateCache - .get(computation) - .ifPresent( - computationState -> { - memoryMonitor.waitForResources("GetWork"); - streamingWorkScheduler.scheduleWork( - computationState, - workItem, - Watermarks.builder() - .setInputDataWatermark(inputDataWatermark) - .setSynchronizedProcessingTime(synchronizedProcessingTime) - .setOutputDataWatermark(workItem.getOutputDataWatermark()) - .build(), - Work.createProcessingContext( - computationState.getComputationId(), - getDataClient, - workCommitter::commit, - heartbeatSender), - getWorkStreamLatencies); - })); - try { - // Reconnect every now and again to enable better load balancing. - // If at any point the server closes the stream, we will reconnect immediately; otherwise - // we half-close the stream after some time and create a new one. - if (!stream.awaitTermination(GET_WORK_STREAM_TIMEOUT_MINUTES, TimeUnit.MINUTES)) { - stream.halfClose(); - } - } catch (InterruptedException e) { - // Continue processing until !running.get() - } - } - } - private void onCompleteCommit(CompleteCommit completeCommit) { if (completeCommit.status() != Windmill.CommitStatus.OK) { readerCache.invalidateReader( @@ -927,15 +822,6 @@ private void onCompleteCommit(CompleteCommit completeCommit) { completeCommit.shardedKey(), completeCommit.workId())); } - private Windmill.GetWorkResponse getWork() { - return windmillServer.getWork( - Windmill.GetWorkRequest.newBuilder() - .setClientId(clientId) - .setMaxItems(chooseMaximumBundlesOutstanding()) - .setMaxBytes(MAX_GET_WORK_FETCH_BYTES) - .build()); - } - @VisibleForTesting public Iterable buildCounters() { return Iterables.concat( @@ -967,4 +853,34 @@ private static ConfigFetcherComputationStateCacheAndWindmillClient create( abstract GrpcWindmillStreamFactory windmillStreamFactory(); } + + /** + * Monitors memory pressure on a background executor. May be used to throttle calls, blocking if + * there is memory pressure. 
+ */ + @AutoValue + abstract static class BackgroundMemoryMonitor { + private static BackgroundMemoryMonitor create(MemoryMonitor memoryMonitor) { + return new AutoValue_StreamingDataflowWorker_BackgroundMemoryMonitor( + memoryMonitor, + Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setNameFormat("MemoryMonitor") + .setPriority(Thread.MIN_PRIORITY) + .build())); + } + + abstract MemoryMonitor memoryMonitor(); + + abstract ExecutorService executor(); + + private void start() { + executor().execute(memoryMonitor()); + } + + private void shutdown() { + memoryMonitor().stop(); + executor().shutdown(); + } + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java similarity index 91% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java index b9573ff94cc9a..3556b7ce29198 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarness.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap.toImmutableMap; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableSet.toImmutableSet; @@ -47,6 +47,8 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.StreamGetDataClient; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; @@ -68,16 +70,19 @@ import org.slf4j.LoggerFactory; /** - * Client for StreamingEngine. Given a {@link GetWorkBudget}, divides the budget and starts the - * {@link WindmillStream.GetWorkStream}(s). + * {@link StreamingWorkerHarness} implementation that manages fan out to multiple backend + * destinations. Given a {@link GetWorkBudget}, divides the budget and starts the {@link + * WindmillStream.GetWorkStream}(s). 
*/ @Internal @CheckReturnValue @ThreadSafe -public final class StreamingEngineClient { - private static final Logger LOG = LoggerFactory.getLogger(StreamingEngineClient.class); +public final class FanOutStreamingEngineWorkerHarness implements StreamingWorkerHarness { + private static final Logger LOG = + LoggerFactory.getLogger(FanOutStreamingEngineWorkerHarness.class); private static final String PUBLISH_NEW_WORKER_METADATA_THREAD = "PublishNewWorkerMetadataThread"; private static final String CONSUME_NEW_WORKER_METADATA_THREAD = "ConsumeNewWorkerMetadataThread"; + private final JobHeader jobHeader; private final GrpcWindmillStreamFactory streamFactory; private final WorkItemScheduler workItemScheduler; @@ -101,7 +106,7 @@ public final class StreamingEngineClient { private volatile boolean started; @SuppressWarnings("FutureReturnValueIgnored") - private StreamingEngineClient( + private FanOutStreamingEngineWorkerHarness( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamFactory, @@ -152,23 +157,15 @@ private StreamingEngineClient( private static ExecutorService singleThreadedExecutorServiceOf(String threadName) { return Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setNameFormat(threadName) - .setUncaughtExceptionHandler( - (t, e) -> { - LOG.error( - "{} failed due to uncaught exception during execution. ", t.getName(), e); - throw new StreamingEngineClientException(e); - }) - .build()); + new ThreadFactoryBuilder().setNameFormat(threadName).build()); } /** - * Creates an instance of {@link StreamingEngineClient} in a non-started state. + * Creates an instance of {@link FanOutStreamingEngineWorkerHarness} in a non-started state. * * @implNote Does not block the calling thread. Callers must explicitly call {@link #start()}. 
*/ - public static StreamingEngineClient create( + public static FanOutStreamingEngineWorkerHarness create( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamingEngineStreamFactory, @@ -178,7 +175,7 @@ public static StreamingEngineClient create( GrpcDispatcherClient dispatcherClient, Function workCommitterFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { - return new StreamingEngineClient( + return new FanOutStreamingEngineWorkerHarness( jobHeader, totalGetWorkBudget, streamingEngineStreamFactory, @@ -192,7 +189,7 @@ public static StreamingEngineClient create( } @VisibleForTesting - static StreamingEngineClient forTesting( + static FanOutStreamingEngineWorkerHarness forTesting( JobHeader jobHeader, GetWorkBudget totalGetWorkBudget, GrpcWindmillStreamFactory streamFactory, @@ -203,8 +200,8 @@ static StreamingEngineClient forTesting( long clientId, Function workCommitterFactory, ThrottlingGetDataMetricTracker getDataMetricTracker) { - StreamingEngineClient streamingEngineClient = - new StreamingEngineClient( + FanOutStreamingEngineWorkerHarness fanOutStreamingEngineWorkProvider = + new FanOutStreamingEngineWorkerHarness( jobHeader, totalGetWorkBudget, streamFactory, @@ -215,11 +212,12 @@ static StreamingEngineClient forTesting( clientId, workCommitterFactory, getDataMetricTracker); - streamingEngineClient.start(); - return streamingEngineClient; + fanOutStreamingEngineWorkProvider.start(); + return fanOutStreamingEngineWorkProvider; } @SuppressWarnings("ReturnValueIgnored") + @Override public synchronized void start() { Preconditions.checkState(!started, "StreamingEngineClient cannot start twice."); // Starts the stream, this value is memoized. 
@@ -270,7 +268,8 @@ private void startWorkerMetadataConsumer() { } @VisibleForTesting - public synchronized void finish() { + @Override + public synchronized void shutdown() { Preconditions.checkState(started, "StreamingEngineClient never started."); getWorkerMetadataStream.get().halfClose(); getWorkBudgetRefresher.stop(); @@ -334,10 +333,13 @@ private synchronized ImmutableMap createNewWindmil .collect( toImmutableMap( Function.identity(), - // Reuse existing stubs if they exist. endpoint -> - currentConnections.getOrDefault( - endpoint, WindmillConnection.from(endpoint, this::createWindmillStub)))); + // Reuse existing stubs if they exist. Optional.orElseGet only calls the + // supplier if the value is not present, preventing constructing expensive + // objects. + Optional.ofNullable(currentConnections.get(endpoint)) + .orElseGet( + () -> WindmillConnection.from(endpoint, this::createWindmillStub)))); } private synchronized ImmutableMap @@ -423,11 +425,4 @@ private CloudWindmillServiceV1Alpha1Stub createWindmillStub(Endpoint endpoint) { .map(channelCachingStubFactory::createWindmillServiceStub) .orElseGet(dispatcherClient::getWindmillServiceStub); } - - private static class StreamingEngineClientException extends IllegalStateException { - - private StreamingEngineClientException(Throwable exception) { - super(exception); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java new file mode 100644 index 0000000000000..bc93e6d89c415 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/SingleSourceWorkerHarness.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; + +import com.google.auto.value.AutoBuilder; +import com.google.auto.value.AutoOneOf; +import java.util.Collections; +import java.util.Optional; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.streaming.ComputationState; +import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; +import org.apache.beam.runners.dataflow.worker.streaming.Work; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub.RpcException; +import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; +import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import 
org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; +import org.apache.beam.runners.dataflow.worker.windmill.work.processing.StreamingWorkScheduler; +import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link StreamingWorkerHarness} implementations that fetch {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem}(s) from a single source. + */ +@Internal +public final class SingleSourceWorkerHarness implements StreamingWorkerHarness { + private static final Logger LOG = LoggerFactory.getLogger(SingleSourceWorkerHarness.class); + private static final int GET_WORK_STREAM_TIMEOUT_MINUTES = 3; + + private final AtomicBoolean isRunning; + private final WorkCommitter workCommitter; + private final GetDataClient getDataClient; + private final HeartbeatSender heartbeatSender; + private final StreamingWorkScheduler streamingWorkScheduler; + private final Runnable waitForResources; + private final Function> computationStateFetcher; + private final ExecutorService workProviderExecutor; + private final GetWorkSender getWorkSender; + + SingleSourceWorkerHarness( + WorkCommitter workCommitter, + GetDataClient getDataClient, + HeartbeatSender heartbeatSender, + StreamingWorkScheduler streamingWorkScheduler, + Runnable waitForResources, + Function> computationStateFetcher, + GetWorkSender getWorkSender) { + this.workCommitter = workCommitter; + this.getDataClient = getDataClient; + this.heartbeatSender = heartbeatSender; + this.streamingWorkScheduler = streamingWorkScheduler; + 
this.waitForResources = waitForResources; + this.computationStateFetcher = computationStateFetcher; + this.workProviderExecutor = + Executors.newSingleThreadScheduledExecutor( + new ThreadFactoryBuilder() + .setDaemon(true) + .setPriority(Thread.MIN_PRIORITY) + .setNameFormat("DispatchThread") + .build()); + this.isRunning = new AtomicBoolean(false); + this.getWorkSender = getWorkSender; + } + + public static SingleSourceWorkerHarness.Builder builder() { + return new AutoBuilder_SingleSourceWorkerHarness_Builder(); + } + + /** + * Starts the work committer and submits the dispatch loop to the dispatch executor. + * + * @throws IllegalStateException if this harness is already running. + */ + @Override + public void start() { + // Guava's Preconditions only substitutes %s placeholders (not SLF4J-style {}). + Preconditions.checkState( + isRunning.compareAndSet(false, true), + "Multiple calls to %s.start() are not allowed.", + getClass()); + workCommitter.start(); + workProviderExecutor.execute( + () -> { + getDispatchLoop().run(); + LOG.info("Dispatch done"); + }); + } + + private Runnable getDispatchLoop() { + switch (getWorkSender.getKind()) { + case APPLIANCE: + LOG.info("Starting Dispatch in Appliance mode."); + return () -> applianceDispatchLoop(getWorkSender.appliance()); + case STREAMING_ENGINE: + LOG.info("Starting Dispatch in Streaming Engine mode."); + return () -> streamingEngineDispatchLoop(getWorkSender.streamingEngine()); + default: + // Will never happen switch is exhaustive.
+ throw new IllegalStateException("Invalid GetWorkSender.Kind: " + getWorkSender.getKind()); + } + } + + /** + * Stops dispatching: shuts down the dispatch executor, waiting up to 10 seconds before + * forcing it with shutdownNow(), then stops the work committer. + * + * @throws IllegalStateException if this harness is not currently running. + */ + @Override + public void shutdown() { + // Guava's Preconditions only substitutes %s placeholders (not SLF4J-style {}). + Preconditions.checkState( + isRunning.compareAndSet(true, false), + "Multiple calls to %s.shutdown() are not allowed.", + getClass()); + workProviderExecutor.shutdown(); + boolean isTerminated = false; + boolean interrupted = false; + try { + isTerminated = workProviderExecutor.awaitTermination(10, TimeUnit.SECONDS); + } catch (InterruptedException e) { + interrupted = true; + LOG.warn("Unable to shutdown {}", getClass()); + } + + if (!isTerminated) { + workProviderExecutor.shutdownNow(); + } + workCommitter.stop(); + if (interrupted) { + // Restore the interrupt status only after cleanup so the caller still observes it. + Thread.currentThread().interrupt(); + } + } + + private void streamingEngineDispatchLoop( + Function getWorkStreamFactory) { + while (isRunning.get()) { + WindmillStream.GetWorkStream stream = + getWorkStreamFactory.apply( + (computationId, + inputDataWatermark, + synchronizedProcessingTime, + workItem, + getWorkStreamLatencies) -> + computationStateFetcher + .apply(computationId) + .ifPresent( + computationState -> { + waitForResources.run(); + streamingWorkScheduler.scheduleWork( + computationState, + workItem, + Watermarks.builder() + .setInputDataWatermark( + Preconditions.checkNotNull(inputDataWatermark)) + .setSynchronizedProcessingTime(synchronizedProcessingTime) + .setOutputDataWatermark(workItem.getOutputDataWatermark()) + .build(), + Work.createProcessingContext( + computationId, + getDataClient, + workCommitter::commit, + heartbeatSender), + getWorkStreamLatencies); + })); + try { + // Reconnect every now and again to enable better load balancing. + // If at any point the server closes the stream, we will reconnect immediately; otherwise + // we half-close the stream after some time and create a new one.
+ if (!stream.awaitTermination(GET_WORK_STREAM_TIMEOUT_MINUTES, TimeUnit.MINUTES)) { + stream.halfClose(); + } + } catch (InterruptedException e) { + // Continue processing until !running.get() + } + } + } + + private void applianceDispatchLoop(Supplier getWorkFn) { + while (isRunning.get()) { + waitForResources.run(); + int backoff = 1; + Windmill.GetWorkResponse workResponse = null; + do { + try { + workResponse = getWorkFn.get(); + if (workResponse.getWorkCount() > 0) { + break; + } + } catch (RpcException e) { + LOG.warn("GetWork failed, retrying:", e); + } + sleepUninterruptibly(backoff, TimeUnit.MILLISECONDS); + backoff = Math.min(1000, backoff * 2); + } while (isRunning.get()); + for (Windmill.ComputationWorkItems computationWork : + Preconditions.checkNotNull(workResponse).getWorkList()) { + String computationId = computationWork.getComputationId(); + Optional maybeComputationState = + computationStateFetcher.apply(computationId); + if (!maybeComputationState.isPresent()) { + continue; + } + + ComputationState computationState = maybeComputationState.get(); + Instant inputDataWatermark = + WindmillTimeUtils.windmillToHarnessWatermark(computationWork.getInputDataWatermark()); + Watermarks.Builder watermarks = + Watermarks.builder() + .setInputDataWatermark(Preconditions.checkNotNull(inputDataWatermark)) + .setSynchronizedProcessingTime( + WindmillTimeUtils.windmillToHarnessWatermark( + computationWork.getDependentRealtimeInputWatermark())); + + for (Windmill.WorkItem workItem : computationWork.getWorkList()) { + streamingWorkScheduler.scheduleWork( + computationState, + workItem, + watermarks.setOutputDataWatermark(workItem.getOutputDataWatermark()).build(), + Work.createProcessingContext( + computationId, getDataClient, workCommitter::commit, heartbeatSender), + /* getWorkStreamLatencies= */ Collections.emptyList()); + } + } + } + } + + @AutoBuilder + public interface Builder { + Builder setWorkCommitter(WorkCommitter workCommitter); + + Builder 
setGetDataClient(GetDataClient getDataClient); + + Builder setHeartbeatSender(HeartbeatSender heartbeatSender); + + Builder setStreamingWorkScheduler(StreamingWorkScheduler streamingWorkScheduler); + + Builder setWaitForResources(Runnable waitForResources); + + Builder setComputationStateFetcher( + Function> computationStateFetcher); + + Builder setGetWorkSender(GetWorkSender getWorkSender); + + SingleSourceWorkerHarness build(); + } + + @AutoOneOf(GetWorkSender.Kind.class) + public abstract static class GetWorkSender { + + public static GetWorkSender forStreamingEngine( + Function getWorkStreamFactory) { + return AutoOneOf_SingleSourceWorkerHarness_GetWorkSender.streamingEngine( + getWorkStreamFactory); + } + + public static GetWorkSender forAppliance(Supplier getWorkFn) { + return AutoOneOf_SingleSourceWorkerHarness_GetWorkSender.appliance(getWorkFn); + } + + abstract Function streamingEngine(); + + abstract Supplier appliance(); + + abstract Kind getKind(); + + enum Kind { + STREAMING_ENGINE, + APPLIANCE + } + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java index 8d784456d655b..3c85ee6abe1f5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineConnectionState.java +++ 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingEngineConnectionState.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import com.google.auto.value.AutoValue; import java.util.function.Supplier; diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java new file mode 100644 index 0000000000000..c1b4570e22600 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/StreamingWorkerHarness.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.streaming.harness; + +import org.apache.beam.sdk.annotations.Internal; + +/** Provides an interface to start streaming worker processing. */ +@Internal +public interface StreamingWorkerHarness { + void start(); + + void shutdown(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java similarity index 93% rename from runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java rename to runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java index 7d09726e4b28a..45aa403ee71b4 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSender.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSender.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; @@ -29,9 +29,11 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.StreamingEngineThrottleTimers; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetSpender; import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.FixedStreamHeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; @@ -57,7 +59,7 @@ */ @Internal @ThreadSafe -public class WindmillStreamSender { +final class WindmillStreamSender implements GetWorkBudgetSpender { private final AtomicBoolean started; private final AtomicReference getWorkBudget; private final Supplier getWorkStream; @@ -107,7 +109,7 @@ private WindmillStreamSender( workItemScheduler)); } - public static WindmillStreamSender create( + static WindmillStreamSender create( WindmillConnection connection, GetWorkRequest getWorkRequest, GetWorkBudget getWorkBudget, @@ -151,6 +153,7 @@ void closeAllStreams() { } } + @Override public void adjustBudget(long itemsDelta, long bytesDelta) { getWorkBudget.set(getWorkBudget.get().apply(itemsDelta, bytesDelta)); if (started.get()) { @@ -158,19 +161,16 @@ public void adjustBudget(long 
itemsDelta, long bytesDelta) { } } - public void adjustBudget(GetWorkBudget adjustment) { - adjustBudget(adjustment.items(), adjustment.bytes()); - } - - public GetWorkBudget remainingGetWorkBudget() { + @Override + public GetWorkBudget remainingBudget() { return started.get() ? getWorkStream.get().remainingBudget() : getWorkBudget.get(); } - public long getAndResetThrottleTime() { + long getAndResetThrottleTime() { return streamingEngineThrottleTimers.getAndResetThrottleTime(); } - public long getCurrentActiveCommitBytes() { + long getCurrentActiveCommitBytes() { return started.get() ? workCommitter.get().currentActiveCommitBytes() : 0; } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java index e0f823d79ade5..adfb380d21647 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServlet.java @@ -23,6 +23,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.Supplier; import java.util.stream.Collectors; import javax.annotation.Nullable; import javax.servlet.ServletException; @@ -31,7 +32,6 @@ import org.apache.beam.runners.dataflow.options.DataflowStreamingPipelineOptions; import org.apache.beam.runners.dataflow.worker.status.BaseStatusServlet; import org.apache.beam.runners.dataflow.worker.status.DebugCapture; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillServerStub; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.channelz.v1.*; import 
org.apache.beam.vendor.grpc.v1p60p1.io.grpc.protobuf.services.ChannelzService; @@ -47,16 +47,16 @@ public class ChannelzServlet extends BaseStatusServlet implements DebugCapture.C private static final int MAX_TOP_CHANNELS_TO_RETURN = 500; private final ChannelzService channelzService; - private final WindmillServerStub windmillServerStub; + private final Supplier> currentWindmillEndpoints; private final boolean showOnlyWindmillServiceChannels; public ChannelzServlet( String path, DataflowStreamingPipelineOptions options, - WindmillServerStub windmillServerStub) { + Supplier> currentWindmillEndpoints) { super(path); channelzService = ChannelzService.newInstance(MAX_TOP_CHANNELS_TO_RETURN); - this.windmillServerStub = windmillServerStub; + this.currentWindmillEndpoints = currentWindmillEndpoints; showOnlyWindmillServiceChannels = options.getChannelzShowOnlyWindmillServiceChannels(); } @@ -81,14 +81,6 @@ public void captureData(PrintWriter writer) { writer.println(""); } - // channelz proto says there won't be cycles in the ref graph. - // we track visited ids to be defensive and prevent any accidental cycles. - private static class VisitedSets { - - Set channels = new HashSet<>(); - Set subchannels = new HashSet<>(); - } - private void appendTopChannels(PrintWriter writer) { SettableFuture future = SettableFuture.create(); // IDEA: If there are more than MAX_TOP_CHANNELS_TO_RETURN top channels @@ -127,8 +119,7 @@ private void appendTopChannels(PrintWriter writer) { } private List filterWindmillChannels(List channels) { - ImmutableSet windmillServiceEndpoints = - windmillServerStub.getWindmillServiceEndpoints(); + ImmutableSet windmillServiceEndpoints = currentWindmillEndpoints.get(); Set windmillServiceHosts = windmillServiceEndpoints.stream().map(HostAndPort::getHost).collect(Collectors.toSet()); List windmillChannels = new ArrayList<>(); @@ -291,4 +282,12 @@ public void onCompleted() { } }; } + + // channelz proto says there won't be cycles in the ref graph. 
+ // we track visited ids to be defensive and prevent any accidental cycles. + private static class VisitedSets { + + Set channels = new HashSet<>(); + Set subchannels = new HashSet<>(); + } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java new file mode 100644 index 0000000000000..9f30f75919f97 --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GetWorkResponseChunkAssembler.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; + +import com.google.auto.value.AutoValue; +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; +import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill; +import org.apache.beam.runners.dataflow.worker.windmill.Windmill.WorkItem; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link ByteString} buffer of {@link + * org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk}(s). + * + *

Once all serialized chunks of a {@link WorkItem} have been received, flushes (deserializes) + * the chunk of bytes and metadata into an {@link AssembledWorkItem}. + * + * @implNote This class is not thread safe, and provides no synchronization underneath. + */ +@NotThreadSafe +final class GetWorkResponseChunkAssembler { + private static final Logger LOG = LoggerFactory.getLogger(GetWorkResponseChunkAssembler.class); + + private final GetWorkTimingInfosTracker workTimingInfosTracker; + private @Nullable ComputationMetadata metadata; + private ByteString data; + private long bufferedSize; + + GetWorkResponseChunkAssembler() { + workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); + data = ByteString.EMPTY; + bufferedSize = 0; + metadata = null; + } + + /** + * Appends the response chunk bytes to the {@link #data} byte buffer. Returns the assembled + * WorkItem if all response chunks for a WorkItem have been received. + */ + Optional append(Windmill.StreamingGetWorkResponseChunk chunk) { + if (chunk.hasComputationMetadata()) { + metadata = ComputationMetadata.fromProto(chunk.getComputationMetadata()); + } + + data = data.concat(chunk.getSerializedWorkItem()); + bufferedSize += chunk.getSerializedWorkItem().size(); + workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); + + // If the entire WorkItem has been received, assemble the WorkItem. + return chunk.getRemainingBytesForWorkItem() == 0 ? flushToWorkItem() : Optional.empty(); + } + + /** + * Attempts to flush the {@link #data} bytes into a {@link WorkItem} with its metadata. Resets the + * data byte string and tracking metadata afterwards, whether the {@link WorkItem} deserialization + * was successful or not. 
+ */ + private Optional flushToWorkItem() { + try { + return Optional.of( + AssembledWorkItem.create( + WorkItem.parseFrom(data.newInput()), + Preconditions.checkNotNull(metadata), + workTimingInfosTracker.getLatencyAttributions(), + bufferedSize)); + } catch (IOException e) { + LOG.error("Failed to parse work item from stream: ", e); + } finally { + workTimingInfosTracker.reset(); + data = ByteString.EMPTY; + bufferedSize = 0; + } + + return Optional.empty(); + } + + @AutoValue + abstract static class ComputationMetadata { + private static ComputationMetadata fromProto( + Windmill.ComputationWorkItemMetadata metadataProto) { + return new AutoValue_GetWorkResponseChunkAssembler_ComputationMetadata( + metadataProto.getComputationId(), + WindmillTimeUtils.windmillToHarnessWatermark(metadataProto.getInputDataWatermark()), + WindmillTimeUtils.windmillToHarnessWatermark( + metadataProto.getDependentRealtimeInputWatermark())); + } + + abstract String computationId(); + + abstract Instant inputDataWatermark(); + + abstract Instant synchronizedProcessingTime(); + } + + @AutoValue + abstract static class AssembledWorkItem { + + private static AssembledWorkItem create( + WorkItem workItem, + ComputationMetadata computationMetadata, + List latencyAttributions, + long size) { + return new AutoValue_GetWorkResponseChunkAssembler_AssembledWorkItem( + workItem, computationMetadata, latencyAttributions, size); + } + + abstract WorkItem workItem(); + + abstract ComputationMetadata computationMetadata(); + + abstract List latencyAttributions(); + + abstract long bufferedSize(); + } +} diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java index 58f72610e2d35..45d010d7cfac5 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDirectGetWorkStream.java @@ -17,8 +17,6 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; -import com.google.auto.value.AutoValue; -import java.io.IOException; import java.io.PrintWriter; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -26,12 +24,9 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.Function; import java.util.function.Supplier; -import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationWorkItemMetadata; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk; @@ -40,6 +35,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GetWorkResponseChunkAssembler.AssembledWorkItem; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import 
org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; @@ -47,13 +43,9 @@ import org.apache.beam.runners.dataflow.worker.windmill.work.refresh.HeartbeatSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Suppliers; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Implementation of {@link GetWorkStream} that passes along a specific {@link @@ -66,7 +58,6 @@ public final class GrpcDirectGetWorkStream extends AbstractWindmillStream implements GetWorkStream { - private static final Logger LOG = LoggerFactory.getLogger(GrpcDirectGetWorkStream.class); private static final StreamingGetWorkRequest HEALTH_CHECK_REQUEST = StreamingGetWorkRequest.newBuilder() .setRequestExtension( @@ -90,8 +81,10 @@ public final class GrpcDirectGetWorkStream * Map of stream IDs to their buffers. Used to aggregate streaming gRPC response chunks as they * come in. Once all chunks for a response has been received, the chunk is processed and the * buffer is cleared. + * + * @implNote Buffers are not persisted across stream restarts. 
*/ - private final ConcurrentMap workItemBuffers; + private final ConcurrentMap workItemAssemblers; private GrpcDirectGetWorkStream( String backendWorkerToken, @@ -120,7 +113,7 @@ private GrpcDirectGetWorkStream( this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.workItemScheduler = workItemScheduler; - this.workItemBuffers = new ConcurrentHashMap<>(); + this.workItemAssemblers = new ConcurrentHashMap<>(); this.heartbeatSender = Suppliers.memoize(heartbeatSender::get); this.workCommitter = Suppliers.memoize(workCommitter::get); this.getDataClient = Suppliers.memoize(getDataClient::get); @@ -163,7 +156,8 @@ public static GrpcDirectGetWorkStream create( return getWorkStream; } - private static Watermarks createWatermarks(WorkItem workItem, ComputationMetadata metadata) { + private static Watermarks createWatermarks( + WorkItem workItem, GetWorkResponseChunkAssembler.ComputationMetadata metadata) { return Watermarks.builder() .setInputDataWatermark(metadata.inputDataWatermark()) .setOutputDataWatermark(workItem.getOutputDataWatermark()) @@ -171,14 +165,8 @@ private static Watermarks createWatermarks(WorkItem workItem, ComputationMetadat .build(); } - private synchronized GetWorkBudget getThenResetBudgetAdjustment() { - return nextBudgetAdjustment.getAndUpdate(unused -> GetWorkBudget.noBudget()); - } - - private void sendRequestExtension() { - // Just sent the request extension, reset the nextBudgetAdjustment. This will be set when - // adjustBudget is called. 
- GetWorkBudget adjustment = getThenResetBudgetAdjustment(); + private void sendRequestExtension(GetWorkBudget adjustment) { + inFlightBudget.getAndUpdate(budget -> budget.apply(adjustment)); StreamingGetWorkRequest extension = StreamingGetWorkRequest.newBuilder() .setRequestExtension( @@ -200,7 +188,7 @@ private void sendRequestExtension() { @Override protected synchronized void onNewStream() { - workItemBuffers.clear(); + workItemAssemblers.clear(); // Add the current in-flight budget to the next adjustment. Only positive values are allowed // here // with negatives defaulting to 0, since GetWorkBudgets cannot be created with negative values. @@ -229,7 +217,7 @@ public void appendSpecificHtml(PrintWriter writer) { // Number of buffers is same as distinct workers that sent work on this stream. writer.format( "GetWorkStream: %d buffers, %s inflight budget allowed.", - workItemBuffers.size(), inFlightBudget.get()); + workItemAssemblers.size(), inFlightBudget.get()); } @Override @@ -240,27 +228,49 @@ public void sendHealthCheck() { @Override protected void onResponse(StreamingGetWorkResponseChunk chunk) { getWorkThrottleTimer.stop(); - WorkItemBuffer workItemBuffer = - workItemBuffers.computeIfAbsent(chunk.getStreamId(), unused -> new WorkItemBuffer()); - workItemBuffer.append(chunk); + workItemAssemblers + .computeIfAbsent(chunk.getStreamId(), unused -> new GetWorkResponseChunkAssembler()) + .append(chunk) + .ifPresent(this::consumeAssembledWorkItem); + } - // The entire WorkItem has been received, it is ready to be processed. - if (chunk.getRemainingBytesForWorkItem() == 0) { - workItemBuffer.runAndReset(); - // Record the fact that there are now fewer outstanding messages and bytes on the stream. - inFlightBudget.updateAndGet(budget -> budget.subtract(1, workItemBuffer.bufferedSize())); + private void consumeAssembledWorkItem(AssembledWorkItem assembledWorkItem) { + // Record the fact that there are now fewer outstanding messages and bytes on the stream. 
+ inFlightBudget.updateAndGet(budget -> budget.subtract(1, assembledWorkItem.bufferedSize())); + WorkItem workItem = assembledWorkItem.workItem(); + GetWorkResponseChunkAssembler.ComputationMetadata metadata = + assembledWorkItem.computationMetadata(); + pendingResponseBudget.getAndUpdate(budget -> budget.apply(1, workItem.getSerializedSize())); + try { + workItemScheduler.scheduleWork( + workItem, + createWatermarks(workItem, Preconditions.checkNotNull(metadata)), + createProcessingContext(Preconditions.checkNotNull(metadata.computationId())), + assembledWorkItem.latencyAttributions()); + } finally { + pendingResponseBudget.getAndUpdate(budget -> budget.apply(-1, -workItem.getSerializedSize())); } } + private Work.ProcessingContext createProcessingContext(String computationId) { + return Work.createProcessingContext( + computationId, getDataClient.get(), workCommitter.get()::commit, heartbeatSender.get()); + } + @Override protected void startThrottleTimer() { getWorkThrottleTimer.start(); } @Override - public synchronized void adjustBudget(long itemsDelta, long bytesDelta) { - nextBudgetAdjustment.set(nextBudgetAdjustment.get().apply(itemsDelta, bytesDelta)); - sendRequestExtension(); + public void adjustBudget(long itemsDelta, long bytesDelta) { + GetWorkBudget adjustment = + nextBudgetAdjustment + // Get the current value, and reset the nextBudgetAdjustment. This will be set again + // when adjustBudget is called. 
+ .getAndUpdate(unused -> GetWorkBudget.noBudget()) + .apply(itemsDelta, bytesDelta); + sendRequestExtension(adjustment); } @Override @@ -274,74 +284,4 @@ public GetWorkBudget remainingBudget() { .apply(currentNextBudgetAdjustment) .apply(currentInflightBudget); } - - private synchronized void updatePendingResponseBudget(long itemsDelta, long bytesDelta) { - pendingResponseBudget.set(pendingResponseBudget.get().apply(itemsDelta, bytesDelta)); - } - - @AutoValue - abstract static class ComputationMetadata { - private static ComputationMetadata fromProto(ComputationWorkItemMetadata metadataProto) { - return new AutoValue_GrpcDirectGetWorkStream_ComputationMetadata( - metadataProto.getComputationId(), - WindmillTimeUtils.windmillToHarnessWatermark(metadataProto.getInputDataWatermark()), - WindmillTimeUtils.windmillToHarnessWatermark( - metadataProto.getDependentRealtimeInputWatermark())); - } - - abstract String computationId(); - - abstract Instant inputDataWatermark(); - - abstract Instant synchronizedProcessingTime(); - } - - private class WorkItemBuffer { - private final GetWorkTimingInfosTracker workTimingInfosTracker; - private ByteString data; - private @Nullable ComputationMetadata metadata; - - private WorkItemBuffer() { - workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); - data = ByteString.EMPTY; - this.metadata = null; - } - - private void append(StreamingGetWorkResponseChunk chunk) { - if (chunk.hasComputationMetadata()) { - this.metadata = ComputationMetadata.fromProto(chunk.getComputationMetadata()); - } - - this.data = data.concat(chunk.getSerializedWorkItem()); - workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); - } - - private long bufferedSize() { - return data.size(); - } - - private void runAndReset() { - try { - WorkItem workItem = WorkItem.parseFrom(data.newInput()); - updatePendingResponseBudget(1, workItem.getSerializedSize()); - workItemScheduler.scheduleWork( - workItem, - 
createWatermarks(workItem, Preconditions.checkNotNull(metadata)), - createProcessingContext(Preconditions.checkNotNull(metadata.computationId())), - // After the work item is successfully queued or dropped by ActiveWorkState, remove it - // from the pendingResponseBudget. - queuedWorkItem -> updatePendingResponseBudget(-1, -workItem.getSerializedSize()), - workTimingInfosTracker.getLatencyAttributions()); - } catch (IOException e) { - LOG.error("Failed to parse work item from stream: ", e); - } - workTimingInfosTracker.reset(); - data = ByteString.EMPTY; - } - - private Work.ProcessingContext createProcessingContext(String computationId) { - return Work.createProcessingContext( - computationId, getDataClient.get(), workCommitter.get()::commit, heartbeatSender.get()); - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java index 033990017b24c..cf2e7260592db 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcDispatcherClient.java @@ -75,7 +75,7 @@ public static GrpcDispatcherClient create(WindmillStubFactory windmillStubFactor } @VisibleForTesting - static GrpcDispatcherClient forTesting( + public static GrpcDispatcherClient forTesting( WindmillStubFactory windmillGrpcStubFactory, List windmillServiceStubs, List windmillMetadataServiceStubs, @@ -106,7 +106,7 @@ ImmutableSet getDispatcherEndpoints() { } /** Will block the calling thread until the initial endpoints are present. 
*/ - CloudWindmillMetadataServiceV1Alpha1Stub getWindmillMetadataServiceStubBlocking() { + public CloudWindmillMetadataServiceV1Alpha1Stub getWindmillMetadataServiceStubBlocking() { boolean initialized = false; long secondsWaited = 0; while (!initialized) { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java index 4b392e9190ed2..09ecbf3f30516 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcGetWorkStream.java @@ -17,45 +17,34 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; -import java.io.IOException; import java.io.PrintWriter; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; -import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.LatencyAttribution; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkRequestExtension; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.StreamingGetWorkResponseChunk; import org.apache.beam.runners.dataflow.worker.windmill.client.AbstractWindmillStream; import 
org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GetWorkResponseChunkAssembler.AssembledWorkItem; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.observers.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemReceiver; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -public final class GrpcGetWorkStream +final class GrpcGetWorkStream extends AbstractWindmillStream implements GetWorkStream { - private static final Logger LOG = LoggerFactory.getLogger(GrpcGetWorkStream.class); - private final GetWorkRequest request; private final WorkItemReceiver receiver; private final ThrottleTimer getWorkThrottleTimer; - private final Map buffers; + private final Map workItemAssemblers; private final AtomicLong inflightMessages; private final AtomicLong inflightBytes; @@ -83,7 +72,7 @@ private GrpcGetWorkStream( this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.receiver = receiver; - this.buffers = new ConcurrentHashMap<>(); + this.workItemAssemblers = new ConcurrentHashMap<>(); this.inflightMessages = new AtomicLong(); this.inflightBytes = new AtomicLong(); } @@ -138,7 +127,7 @@ private void sendRequestExtension(long moreItems, long moreBytes) { @Override protected synchronized void onNewStream() { - buffers.clear(); + workItemAssemblers.clear(); inflightMessages.set(request.getMaxItems()); inflightBytes.set(request.getMaxBytes()); 
send(StreamingGetWorkRequest.newBuilder().setRequest(request).build()); @@ -154,7 +143,7 @@ public void appendSpecificHtml(PrintWriter writer) { // Number of buffers is same as distinct workers that sent work on this stream. writer.format( "GetWorkStream: %d buffers, %d inflight messages allowed, %d inflight bytes allowed", - buffers.size(), inflightMessages.intValue(), inflightBytes.intValue()); + workItemAssemblers.size(), inflightMessages.intValue(), inflightBytes.intValue()); } @Override @@ -169,30 +158,33 @@ public void sendHealthCheck() { @Override protected void onResponse(StreamingGetWorkResponseChunk chunk) { getWorkThrottleTimer.stop(); + workItemAssemblers + .computeIfAbsent(chunk.getStreamId(), unused -> new GetWorkResponseChunkAssembler()) + .append(chunk) + .ifPresent(this::consumeAssembledWorkItem); + } - GrpcGetWorkStream.WorkItemBuffer buffer = - buffers.computeIfAbsent( - chunk.getStreamId(), unused -> new GrpcGetWorkStream.WorkItemBuffer()); - buffer.append(chunk); - - if (chunk.getRemainingBytesForWorkItem() == 0) { - long size = buffer.bufferedSize(); - buffer.runAndReset(); - - // Record the fact that there are now fewer outstanding messages and bytes on the stream. - long numInflight = inflightMessages.decrementAndGet(); - long bytesInflight = inflightBytes.addAndGet(-size); - - // If the outstanding items or bytes limit has gotten too low, top both off with a - // GetWorkExtension. The goal is to keep the limits relatively close to their maximum - // values without sending too many extension requests. 
- if (numInflight < request.getMaxItems() / 2 || bytesInflight < request.getMaxBytes() / 2) { - long moreItems = request.getMaxItems() - numInflight; - long moreBytes = request.getMaxBytes() - bytesInflight; - inflightMessages.getAndAdd(moreItems); - inflightBytes.getAndAdd(moreBytes); - sendRequestExtension(moreItems, moreBytes); - } + private void consumeAssembledWorkItem(AssembledWorkItem assembledWorkItem) { + receiver.receiveWork( + assembledWorkItem.computationMetadata().computationId(), + assembledWorkItem.computationMetadata().inputDataWatermark(), + assembledWorkItem.computationMetadata().synchronizedProcessingTime(), + assembledWorkItem.workItem(), + assembledWorkItem.latencyAttributions()); + + // Record the fact that there are now fewer outstanding messages and bytes on the stream. + long numInflight = inflightMessages.decrementAndGet(); + long bytesInflight = inflightBytes.addAndGet(-assembledWorkItem.bufferedSize()); + + // If the outstanding items or bytes limit has gotten too low, top both off with a + // GetWorkExtension. The goal is to keep the limits relatively close to their maximum + // values without sending too many extension requests. 
+ if (numInflight < request.getMaxItems() / 2 || bytesInflight < request.getMaxBytes() / 2) { + long moreItems = request.getMaxItems() - numInflight; + long moreBytes = request.getMaxBytes() - bytesInflight; + inflightMessages.getAndAdd(moreItems); + inflightBytes.getAndAdd(moreBytes); + sendRequestExtension(moreItems, moreBytes); } } @@ -213,63 +205,4 @@ public GetWorkBudget remainingBudget() { .setItems(request.getMaxItems() - inflightMessages.get()) .build(); } - - private class WorkItemBuffer { - private final GetWorkTimingInfosTracker workTimingInfosTracker; - private String computation; - @Nullable private Instant inputDataWatermark; - @Nullable private Instant synchronizedProcessingTime; - private ByteString data; - private long bufferedSize; - - @SuppressWarnings("initialization.fields.uninitialized") - WorkItemBuffer() { - workTimingInfosTracker = new GetWorkTimingInfosTracker(System::currentTimeMillis); - data = ByteString.EMPTY; - bufferedSize = 0; - } - - @SuppressWarnings("NullableProblems") - private void setMetadata(Windmill.ComputationWorkItemMetadata metadata) { - this.computation = metadata.getComputationId(); - this.inputDataWatermark = - WindmillTimeUtils.windmillToHarnessWatermark(metadata.getInputDataWatermark()); - this.synchronizedProcessingTime = - WindmillTimeUtils.windmillToHarnessWatermark( - metadata.getDependentRealtimeInputWatermark()); - } - - private void append(StreamingGetWorkResponseChunk chunk) { - if (chunk.hasComputationMetadata()) { - setMetadata(chunk.getComputationMetadata()); - } - - this.data = data.concat(chunk.getSerializedWorkItem()); - this.bufferedSize += chunk.getSerializedWorkItem().size(); - workTimingInfosTracker.addTimingInfo(chunk.getPerWorkItemTimingInfosList()); - } - - private long bufferedSize() { - return bufferedSize; - } - - private void runAndReset() { - try { - Windmill.WorkItem workItem = Windmill.WorkItem.parseFrom(data.newInput()); - List getWorkStreamLatencies = - 
workTimingInfosTracker.getLatencyAttributions(); - receiver.receiveWork( - computation, - inputDataWatermark, - synchronizedProcessingTime, - workItem, - getWorkStreamLatencies); - } catch (IOException e) { - LOG.error("Failed to parse work item from stream: ", e); - } - workTimingInfosTracker.reset(); - data = ByteString.EMPTY; - bufferedSize = 0; - } - } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java index 17c9f7d80d5da..00784493fe3df 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/WorkItemScheduler.java @@ -18,7 +18,6 @@ package org.apache.beam.runners.dataflow.worker.windmill.work; import java.util.Collection; -import java.util.function.Consumer; import javax.annotation.CheckReturnValue; import org.apache.beam.runners.dataflow.worker.streaming.Watermarks; import org.apache.beam.runners.dataflow.worker.streaming.Work; @@ -36,8 +35,6 @@ public interface WorkItemScheduler { * @param workItem {@link WorkItem} to be processed. * @param watermarks processing watermarks for the workItem. * @param processingContext for processing the workItem. - * @param ackWorkItemQueued Called after an attempt to queue the work item for processing. Used to - * free up pending budget. * @param getWorkStreamLatencies Latencies per processing stage for the WorkItem for reporting * back to Streaming Engine backend. 
*/ @@ -45,6 +42,5 @@ void scheduleWork( WorkItem workItem, Watermarks watermarks, Work.ProcessingContext processingContext, - Consumer ackWorkItemQueued, Collection getWorkStreamLatencies); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java index 3a17222d3e6bd..403bb99efb4c5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributor.java @@ -26,14 +26,13 @@ import java.util.Map.Entry; import java.util.function.Function; import java.util.function.Supplier; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableCollection; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Evenly distributes the provided budget across the available {@link WindmillStreamSender}(s). */ +/** Evenly distributes the provided budget across the available {@link GetWorkBudgetSpender}(s). 
*/ @Internal final class EvenGetWorkBudgetDistributor implements GetWorkBudgetDistributor { private static final Logger LOG = LoggerFactory.getLogger(EvenGetWorkBudgetDistributor.class); @@ -50,10 +49,10 @@ private static boolean isBelowFiftyPercentOfTarget( } @Override - public void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget) { - if (streams.isEmpty()) { - LOG.debug("Cannot distribute budget to no streams."); + public void distributeBudget( + ImmutableCollection budgetOwners, GetWorkBudget getWorkBudget) { + if (budgetOwners.isEmpty()) { + LOG.debug("Cannot distribute budget to no owners."); return; } @@ -62,23 +61,21 @@ public void distributeBudget( return; } - Map desiredBudgets = - computeDesiredBudgets(streams, getWorkBudget); + Map desiredBudgets = computeDesiredBudgets(budgetOwners, getWorkBudget); - for (Entry streamAndDesiredBudget : - desiredBudgets.entrySet()) { - WindmillStreamSender stream = streamAndDesiredBudget.getKey(); + for (Entry streamAndDesiredBudget : desiredBudgets.entrySet()) { + GetWorkBudgetSpender getWorkBudgetSpender = streamAndDesiredBudget.getKey(); GetWorkBudget desired = streamAndDesiredBudget.getValue(); - GetWorkBudget remaining = stream.remainingGetWorkBudget(); + GetWorkBudget remaining = getWorkBudgetSpender.remainingBudget(); if (isBelowFiftyPercentOfTarget(remaining, desired)) { GetWorkBudget adjustment = desired.subtract(remaining); - stream.adjustBudget(adjustment); + getWorkBudgetSpender.adjustBudget(adjustment); } } } - private ImmutableMap computeDesiredBudgets( - ImmutableCollection streams, GetWorkBudget totalGetWorkBudget) { + private ImmutableMap computeDesiredBudgets( + ImmutableCollection streams, GetWorkBudget totalGetWorkBudget) { GetWorkBudget activeWorkBudget = activeWorkBudgetSupplier.get(); LOG.info("Current active work budget: {}", activeWorkBudget); // TODO: Fix possibly non-deterministic handing out of budgets. 
diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java index 3ec9718e041e5..d21de17e522c5 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetDistributor.java @@ -17,7 +17,6 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.work.budget; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableCollection; @@ -28,6 +27,6 @@ */ @Internal public interface GetWorkBudgetDistributor { - void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget); + void distributeBudget( + ImmutableCollection streams, GetWorkBudget getWorkBudget); } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java new file mode 100644 index 0000000000000..254b2589062ef --- /dev/null +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/GetWorkBudgetSpender.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.dataflow.worker.windmill.work.budget; + +/** + * Represents something that spends {@link + * org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget} + */ +public interface GetWorkBudgetSpender { + void adjustBudget(long itemsDelta, long bytesDelta); + + default void adjustBudget(GetWorkBudget adjustment) { + adjustBudget(adjustment.items(), adjustment.bytes()); + } + + GetWorkBudget remainingBudget(); +} diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java index 5855057c4210c..d16ed2942fd9c 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/StreamingDataflowWorkerTest.java @@ -2182,7 +2182,7 @@ public void testMergeWindowsCaching() throws Exception { // No input messages assertEquals(0L, splitIntToLong(getCounter(counters, "WindmillShuffleBytesRead").getInteger())); - CacheStats stats = worker.stateCache.getCacheStats(); + CacheStats stats = worker.getStateCacheStats(); LOG.info("cache stats {}", stats); assertEquals(1, 
stats.hitCount()); assertEquals(4, stats.missCount()); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java similarity index 93% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java index 1999dbe319027..aaa71b6598ea2 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/StreamingEngineClientTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/FanOutStreamingEngineWorkerHarnessTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -49,12 +49,15 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillServiceAddress; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.ThrottlingGetDataMetricTracker; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcDispatcherClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.ChannelCachingStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.stubs.WindmillChannelFactory; import org.apache.beam.runners.dataflow.worker.windmill.testing.FakeWindmillStubFactory; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetDistributor; +import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudgetSpender; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Server; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessSocketAddress; @@ -76,7 +79,7 @@ import org.junit.runners.JUnit4; @RunWith(JUnit4.class) -public class StreamingEngineClientTest { +public class FanOutStreamingEngineWorkerHarnessTest { private static final WindmillServiceAddress DEFAULT_WINDMILL_SERVICE_ADDRESS = WindmillServiceAddress.create(HostAndPort.fromParts(WindmillChannelFactory.LOCALHOST, 443)); private static final ImmutableMap DEFAULT = @@ -113,14 +116,10 @@ public class 
StreamingEngineClientTest { private Server fakeStreamingEngineServer; private CountDownLatch getWorkerMetadataReady; private GetWorkerMetadataTestStub fakeGetWorkerMetadataStub; - private StreamingEngineClient streamingEngineClient; + private FanOutStreamingEngineWorkerHarness fanOutStreamingEngineWorkProvider; private static WorkItemScheduler noOpProcessWorkItemFn() { - return (workItem, - watermarks, - processingContext, - ackWorkItemQueued, - getWorkStreamLatencies) -> {}; + return (workItem, watermarks, processingContext, getWorkStreamLatencies) -> {}; } private static GetWorkRequest getWorkRequest(long items, long bytes) { @@ -163,16 +162,16 @@ public void setUp() throws IOException { @After public void cleanUp() { - Preconditions.checkNotNull(streamingEngineClient).finish(); + Preconditions.checkNotNull(fanOutStreamingEngineWorkProvider).shutdown(); fakeStreamingEngineServer.shutdownNow(); stubFactory.shutdown(); } - private StreamingEngineClient newStreamingEngineClient( + private FanOutStreamingEngineWorkerHarness newStreamingEngineClient( GetWorkBudget getWorkBudget, GetWorkBudgetDistributor getWorkBudgetDistributor, WorkItemScheduler workItemScheduler) { - return StreamingEngineClient.forTesting( + return FanOutStreamingEngineWorkerHarness.forTesting( JOB_HEADER, getWorkBudget, streamFactory, @@ -194,7 +193,7 @@ public void testStreamsStartCorrectly() throws InterruptedException { TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(numBudgetDistributionsExpected)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(items).setBytes(bytes).build(), getWorkBudgetDistributor, @@ -216,7 +215,7 @@ public void testStreamsStartCorrectly() throws InterruptedException { waitForWorkerMetadataToBeConsumed(getWorkBudgetDistributor); StreamingEngineConnectionState currentConnections = - streamingEngineClient.getCurrentConnections(); + 
fanOutStreamingEngineWorkProvider.getCurrentConnections(); assertEquals(2, currentConnections.windmillConnections().size()); assertEquals(2, currentConnections.windmillStreams().size()); @@ -250,7 +249,7 @@ public void testStreamsStartCorrectly() throws InterruptedException { public void testScheduledBudgetRefresh() throws InterruptedException { TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(2)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1L).setBytes(1L).build(), getWorkBudgetDistributor, @@ -273,7 +272,7 @@ public void testOnNewWorkerMetadata_correctlyRemovesStaleWindmillServers() int metadataCount = 2; TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new TestGetWorkBudgetDistributor(metadataCount)); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1).setBytes(1).build(), getWorkBudgetDistributor, @@ -311,11 +310,12 @@ public void testOnNewWorkerMetadata_correctlyRemovesStaleWindmillServers() fakeGetWorkerMetadataStub.injectWorkerMetadata(secondWorkerMetadata); waitForWorkerMetadataToBeConsumed(getWorkBudgetDistributor); StreamingEngineConnectionState currentConnections = - streamingEngineClient.getCurrentConnections(); + fanOutStreamingEngineWorkProvider.getCurrentConnections(); assertEquals(1, currentConnections.windmillConnections().size()); assertEquals(1, currentConnections.windmillStreams().size()); Set workerTokens = - streamingEngineClient.getCurrentConnections().windmillConnections().values().stream() + fanOutStreamingEngineWorkProvider.getCurrentConnections().windmillConnections().values() + .stream() .map(WindmillConnection::backendWorkerToken) .collect(Collectors.toSet()); @@ -362,7 +362,7 @@ public void testOnNewWorkerMetadata_redistributesBudget() throws InterruptedExce TestGetWorkBudgetDistributor getWorkBudgetDistributor = spy(new 
TestGetWorkBudgetDistributor(workerMetadataResponses.size())); - streamingEngineClient = + fanOutStreamingEngineWorkProvider = newStreamingEngineClient( GetWorkBudget.builder().setItems(1).setBytes(1).build(), getWorkBudgetDistributor, @@ -439,8 +439,8 @@ private void waitForBudgetDistribution() throws InterruptedException { } @Override - public void distributeBudget( - ImmutableCollection streams, GetWorkBudget getWorkBudget) { + public void distributeBudget( + ImmutableCollection streams, GetWorkBudget getWorkBudget) { streams.forEach(stream -> stream.adjustBudget(getWorkBudget.items(), getWorkBudget.bytes())); getWorkBudgetDistributorTriggered.countDown(); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java similarity index 97% rename from runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java rename to runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java index 9d49c3ef3146d..dc6cc5641055a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/WindmillStreamSenderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/streaming/harness/WindmillStreamSenderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.runners.dataflow.worker.windmill.client.grpc; +package org.apache.beam.runners.dataflow.worker.streaming.harness; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -35,6 +35,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.client.WindmillStream.GetWorkStream; import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; +import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; import org.apache.beam.runners.dataflow.worker.windmill.client.throttling.ThrottleTimer; import org.apache.beam.runners.dataflow.worker.windmill.work.WorkItemScheduler; import org.apache.beam.runners.dataflow.worker.windmill.work.budget.GetWorkBudget; @@ -64,7 +65,7 @@ public class WindmillStreamSenderTest { .build()) .build()); private final WorkItemScheduler workItemScheduler = - (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}; + (workItem, watermarks, processingContext, getWorkStreamLatencies) -> {}; @Rule public transient Timeout globalTimeout = Timeout.seconds(600); private ManagedChannel inProcessChannel; private WindmillConnection connection; diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java index 96c675169a7d2..d234cf424767b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/ChannelzServletTest.java @@ -56,7 +56,8 @@ public void 
testRendersAllChannels() throws UnsupportedEncodingException { fakeWindmillServer.setWindmillServiceEndpoints( ImmutableSet.of(HostAndPort.fromHost(windmill1), HostAndPort.fromHost(windmill2))); options.setChannelzShowOnlyWindmillServiceChannels(false); - ChannelzServlet channelzServlet = new ChannelzServlet("/channelz", options, fakeWindmillServer); + ChannelzServlet channelzServlet = + new ChannelzServlet("/channelz", options, fakeWindmillServer::getWindmillServiceEndpoints); StringWriter stringWriter = new StringWriter(); PrintWriter writer = new PrintWriter(stringWriter); channelzServlet.captureData(writer); @@ -88,7 +89,8 @@ public void testRendersOnlyWindmillChannels() throws UnsupportedEncodingExceptio fakeWindmillServer.setWindmillServiceEndpoints( ImmutableSet.of(HostAndPort.fromHost(windmill1), HostAndPort.fromHost(windmill2))); options.setChannelzShowOnlyWindmillServiceChannels(true); - ChannelzServlet channelzServlet = new ChannelzServlet("/channelz", options, fakeWindmillServer); + ChannelzServlet channelzServlet = + new ChannelzServlet("/channelz", options, fakeWindmillServer::getWindmillServiceEndpoints); StringWriter stringWriter = new StringWriter(); PrintWriter writer = new PrintWriter(stringWriter); channelzServlet.captureData(writer); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java index 5cfc19ac07dfd..7e5801b65de47 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/GrpcWindmillServerTest.java @@ -1142,7 +1142,13 @@ public void onNext(StreamingGetWorkRequest request) 
{ StreamingGetWorkResponseChunk.newBuilder() .setStreamId(id) .setSerializedWorkItem(serializedResponse) - .setRemainingBytesForWorkItem(0); + .setRemainingBytesForWorkItem(0) + .setComputationMetadata( + ComputationWorkItemMetadata.newBuilder() + .setComputationId("computation") + .setInputDataWatermark(1L) + .setDependentRealtimeInputWatermark(1L) + .build()); try { responseObserver.onNext(builder.build()); } catch (IllegalStateException e) { @@ -1175,9 +1181,7 @@ public void onCompleted() { @Nullable Instant inputDataWatermark, Instant synchronizedProcessingTime, Windmill.WorkItem workItem, - Collection getWorkStreamLatencies) -> { - latch.countDown(); - }); + Collection getWorkStreamLatencies) -> latch.countDown()); // Wait for 100 items or 30 seconds. assertTrue(latch.await(30, TimeUnit.SECONDS)); // Confirm that we report at least as much throttle time as our server sent errors for. We will diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java index b0c305dc4ec45..3cda4559c100b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/budget/EvenGetWorkBudgetDistributorTest.java @@ -19,7 +19,6 @@ import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; @@ -28,20 +27,8 @@ import java.util.ArrayList; import java.util.List; -import 
org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill; -import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; -import org.apache.beam.runners.dataflow.worker.windmill.WindmillConnection; -import org.apache.beam.runners.dataflow.worker.windmill.client.commits.WorkCommitter; -import org.apache.beam.runners.dataflow.worker.windmill.client.getdata.GetDataClient; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.GrpcWindmillStreamFactory; -import org.apache.beam.runners.dataflow.worker.windmill.client.grpc.WindmillStreamSender; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; -import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; -import org.junit.After; -import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.Timeout; @@ -52,8 +39,6 @@ public class EvenGetWorkBudgetDistributorTest { @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); @Rule public transient Timeout globalTimeout = Timeout.seconds(600); - private ManagedChannel inProcessChannel; - private CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub; private static GetWorkBudgetDistributor createBudgetDistributor(GetWorkBudget activeWorkBudget) { return GetWorkBudgetDistributors.distributeEvenly(() -> activeWorkBudget); @@ -67,20 +52,6 @@ private static GetWorkBudgetDistributor createBudgetDistributor(long activeWorkI .build()); } - @Before - public void setUp() { - inProcessChannel = - grpcCleanup.register( - InProcessChannelBuilder.forName("WindmillStreamSenderTest").directExecutor().build()); - grpcCleanup.register(inProcessChannel); - stub = 
CloudWindmillServiceV1Alpha1Grpc.newStub(inProcessChannel); - } - - @After - public void cleanUp() { - inProcessChannel.shutdownNow(); - } - @Test public void testDistributeBudget_doesNothingWhenPassedInStreamsEmpty() { createBudgetDistributor(1L) @@ -90,38 +61,40 @@ public void testDistributeBudget_doesNothingWhenPassedInStreamsEmpty() { @Test public void testDistributeBudget_doesNothingWithNoBudget() { - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(GetWorkBudget.noBudget())); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget())); createBudgetDistributor(1L) - .distributeBudget(ImmutableList.of(windmillStreamSender), GetWorkBudget.noBudget()); - verifyNoInteractions(windmillStreamSender); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.noBudget()); + verifyNoInteractions(getWorkBudgetSpender); } @Test public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHighNoActiveWork() { - WindmillStreamSender windmillStreamSender = + GetWorkBudgetSpender getWorkBudgetSpender = spy( - createWindmillStreamSender( + createGetWorkBudgetOwnerWithRemainingBudgetOf( GetWorkBudget.builder().setItems(10L).setBytes(10L).build())); createBudgetDistributor(0L) .distributeBudget( - ImmutableList.of(windmillStreamSender), + ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.builder().setItems(10L).setBytes(10L).build()); - verify(windmillStreamSender, never()).adjustBudget(anyLong(), anyLong()); + verify(getWorkBudgetSpender, never()).adjustBudget(anyLong(), anyLong()); } @Test public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHighWithActiveWork() { - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(GetWorkBudget.builder().setItems(5L).setBytes(5L).build())); + GetWorkBudgetSpender getWorkBudgetSpender = + spy( + createGetWorkBudgetOwnerWithRemainingBudgetOf( + 
GetWorkBudget.builder().setItems(5L).setBytes(5L).build())); createBudgetDistributor(10L) .distributeBudget( - ImmutableList.of(windmillStreamSender), + ImmutableList.of(getWorkBudgetSpender), GetWorkBudget.builder().setItems(20L).setBytes(20L).build()); - verify(windmillStreamSender, never()).adjustBudget(anyLong(), anyLong()); + verify(getWorkBudgetSpender, never()).adjustBudget(anyLong(), anyLong()); } @Test @@ -130,12 +103,12 @@ public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHig GetWorkBudget streamRemainingBudget = GetWorkBudget.builder().setItems(1L).setBytes(10L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(0L) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - streamRemainingBudget.items()), eq(totalGetWorkBudget.bytes() - streamRemainingBudget.bytes())); @@ -148,12 +121,12 @@ public void testDistributeBudget_doesNotAdjustStreamBudgetWhenRemainingBudgetHig GetWorkBudget.builder().setItems(1L).setBytes(10L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); long activeWorkItemsAndBytes = 2L; - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(activeWorkItemsAndBytes) - .distributeBudget(ImmutableList.of(windmillStreamSender), 
totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq( totalGetWorkBudget.items() @@ -167,12 +140,12 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo GetWorkBudget streamRemainingBudget = GetWorkBudget.builder().setItems(10L).setBytes(1L).build(); GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(0L) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - streamRemainingBudget.items()), eq(totalGetWorkBudget.bytes() - streamRemainingBudget.bytes())); @@ -186,12 +159,12 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo GetWorkBudget totalGetWorkBudget = GetWorkBudget.builder().setItems(10L).setBytes(10L).build(); long activeWorkItemsAndBytes = 2L; - WindmillStreamSender windmillStreamSender = - spy(createWindmillStreamSender(streamRemainingBudget)); + GetWorkBudgetSpender getWorkBudgetSpender = + spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(streamRemainingBudget)); createBudgetDistributor(activeWorkItemsAndBytes) - .distributeBudget(ImmutableList.of(windmillStreamSender), totalGetWorkBudget); + .distributeBudget(ImmutableList.of(getWorkBudgetSpender), totalGetWorkBudget); - verify(windmillStreamSender, times(1)) + verify(getWorkBudgetSpender, times(1)) .adjustBudget( eq(totalGetWorkBudget.items() - 
streamRemainingBudget.items()), eq( @@ -203,9 +176,9 @@ public void testDistributeBudget_adjustsStreamBudgetWhenRemainingByteBudgetTooLo @Test public void testDistributeBudget_distributesBudgetEvenlyIfPossible() { long totalItemsAndBytes = 10L; - List streams = new ArrayList<>(); + List streams = new ArrayList<>(); for (int i = 0; i < totalItemsAndBytes; i++) { - streams.add(spy(createWindmillStreamSender(GetWorkBudget.noBudget()))); + streams.add(spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget()))); } createBudgetDistributor(0L) .distributeBudget( @@ -225,9 +198,9 @@ public void testDistributeBudget_distributesBudgetEvenlyIfPossible() { @Test public void testDistributeBudget_distributesFairlyWhenNotEven() { long totalItemsAndBytes = 10L; - List streams = new ArrayList<>(); + List streams = new ArrayList<>(); for (int i = 0; i < 3; i++) { - streams.add(spy(createWindmillStreamSender(GetWorkBudget.noBudget()))); + streams.add(spy(createGetWorkBudgetOwnerWithRemainingBudgetOf(GetWorkBudget.noBudget()))); } createBudgetDistributor(0L) .distributeBudget( @@ -244,24 +217,17 @@ public void testDistributeBudget_distributesFairlyWhenNotEven() { .adjustBudget(eq(itemsAndBytesPerStream), eq(itemsAndBytesPerStream))); } - private WindmillStreamSender createWindmillStreamSender(GetWorkBudget getWorkBudget) { - return WindmillStreamSender.create( - WindmillConnection.builder().setStub(stub).build(), - Windmill.GetWorkRequest.newBuilder() - .setClientId(1L) - .setJobId("job") - .setProjectId("project") - .build(), - getWorkBudget, - GrpcWindmillStreamFactory.of( - JobHeader.newBuilder() - .setJobId("job") - .setProjectId("project") - .setWorkerId("worker") - .build()) - .build(), - (workItem, watermarks, processingContext, ackWorkItemQueued, getWorkStreamLatencies) -> {}, - ignored -> mock(GetDataClient.class), - ignored -> mock(WorkCommitter.class)); + private GetWorkBudgetSpender createGetWorkBudgetOwnerWithRemainingBudgetOf( + GetWorkBudget 
getWorkBudget) { + return spy( + new GetWorkBudgetSpender() { + @Override + public void adjustBudget(long itemsDelta, long bytesDelta) {} + + @Override + public GetWorkBudget remainingBudget() { + return getWorkBudget; + } + }); } } From bfc64d5c14adea209364d48b0fe9b4e8ba6eaab5 Mon Sep 17 00:00:00 2001 From: scwhittle Date: Mon, 5 Aug 2024 11:34:45 +0200 Subject: [PATCH 09/78] Fix error when ActiveWorkRefresher processed empty heartbeat map. (#32078) --- .../work/refresh/ActiveWorkRefresher.java | 3 ++ .../work/refresh/ActiveWorkRefresherTest.java | 38 +++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java index 499d2e5b6943c..781285def020e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresher.java @@ -130,6 +130,9 @@ private void refreshActiveWork() { Instant refreshDeadline = clock.get().minus(Duration.millis(activeWorkRefreshPeriodMillis)); Map heartbeatsBySender = aggregateHeartbeatsBySender(refreshDeadline); + if (heartbeatsBySender.isEmpty()) { + return; + } List> fanOutRefreshActiveWork = new ArrayList<>(); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java index 9dce3392c60c5..5efb2421fe607 100644 --- 
a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/work/refresh/ActiveWorkRefresherTest.java @@ -18,6 +18,7 @@ package org.apache.beam.runners.dataflow.worker.windmill.work.refresh; import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.*; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.doAnswer; @@ -194,10 +195,13 @@ public void testActiveWorkRefresh() throws InterruptedException { assertThat(heartbeatRequests) .comparingElementsUsing( Correspondence.from( - (Windmill.HeartbeatRequest h, Work w) -> - h.getWorkToken() == w.getWorkItem().getWorkToken() - && h.getCacheToken() == w.getWorkItem().getWorkToken() - && h.getShardingKey() == w.getWorkItem().getShardingKey(), + (Windmill.HeartbeatRequest h, Work w) -> { + assert h != null; + assert w != null; + return h.getWorkToken() == w.getWorkItem().getWorkToken() + && h.getCacheToken() == w.getWorkItem().getWorkToken() + && h.getShardingKey() == w.getWorkItem().getShardingKey(); + }, "heartbeatRequest's and Work's workTokens, cacheTokens, and shardingKeys should be equal.")) .containsExactlyElementsIn(work); } @@ -207,6 +211,32 @@ public void testActiveWorkRefresh() throws InterruptedException { workIsProcessed.countDown(); } + @Test + public void testEmptyActiveWorkRefresh() throws InterruptedException { + int activeWorkRefreshPeriodMillis = 100; + + List computations = new ArrayList<>(); + for (int i = 0; i < 5; i++) { + ComputationState computationState = createComputationState(i); + computations.add(computationState); + } + + CountDownLatch heartbeatsSent = new CountDownLatch(1); + TestClock fakeClock = new TestClock(Instant.now()); + ActiveWorkRefresher activeWorkRefresher = + 
createActiveWorkRefresher( + fakeClock::now, + activeWorkRefreshPeriodMillis, + 0, + () -> computations, + heartbeats -> heartbeatsSent::countDown); + + activeWorkRefresher.start(); + fakeClock.advance(Duration.millis(activeWorkRefreshPeriodMillis * 2)); + assertFalse(heartbeatsSent.await(500, TimeUnit.MILLISECONDS)); + activeWorkRefresher.stop(); + } + @Test public void testInvalidateStuckCommits() throws InterruptedException { int stuckCommitDurationMillis = 100; From 80ae93217c5ac74e41cbedaeea7806fb0f05c2a9 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Mon, 5 Aug 2024 09:49:55 -0700 Subject: [PATCH 10/78] Minor optimization for the common case of merging empty string sets. (#31803) --- .../runners/core/metrics/StringSetData.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java index 93dfb8e3ebc80..466d4ad46eb6f 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/metrics/StringSetData.java @@ -19,7 +19,6 @@ import com.google.auto.value.AutoValue; import java.io.Serializable; -import java.util.HashSet; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -50,12 +49,16 @@ public static StringSetData empty() { * Combines this {@link StringSetData} with other, both original StringSetData are left intact. 
*/ public StringSetData combine(StringSetData other) { - // do not merge other on this as this StringSetData might hold an immutable set like in case - // of EmptyStringSetData - Set combined = new HashSet<>(); - combined.addAll(this.stringSet()); - combined.addAll(other.stringSet()); - return StringSetData.create(combined); + if (this.stringSet().isEmpty()) { + return other; + } else if (other.stringSet().isEmpty()) { + return this; + } else { + ImmutableSet.Builder combined = ImmutableSet.builder(); + combined.addAll(this.stringSet()); + combined.addAll(other.stringSet()); + return StringSetData.create(combined.build()); + } } /** From 5b2bfe96f83a5631c3a8d5c3b92a0f695ffe2d7d Mon Sep 17 00:00:00 2001 From: Damon Date: Mon, 5 Aug 2024 10:25:37 -0700 Subject: [PATCH 11/78] [Prism] Enable an artifact resolver for the Prism runner (#32058) * Enable an ArtifactResolver for the Prism runner * Rename class * spotlessApply * Fix Builder instantiation --- .../runners/prism/PrismArtifactResolver.java | 110 ++++++++++++++++++ .../prism/PrismArtifactResolverTest.java | 45 +++++++ 2 files changed, 155 insertions(+) create mode 100644 runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java create mode 100644 runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java new file mode 100644 index 0000000000000..db56bc6047ca7 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactResolver.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.util.Optional; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.util.construction.DefaultArtifactResolver; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.sdk.util.construction.SdkComponents; + +/** + * The {@link PrismArtifactResolver} converts a {@link Pipeline} to a {@link RunnerApi.Pipeline} via + * resolving {@link RunnerApi.ArtifactInformation}. + */ +@AutoValue +abstract class PrismArtifactResolver { + + /** + * Instantiates a {@link PrismArtifactResolver} from the {@code pipeline}, applying defaults to + * the remaining dependencies. + */ + static PrismArtifactResolver of(Pipeline pipeline) { + return PrismArtifactResolver.builder().setPipeline(pipeline).build(); + } + + static Builder builder() { + return new AutoValue_PrismArtifactResolver.Builder(); + } + + /** + * Converts the {@link #getPipeline()} using {@link PipelineTranslation#toProto} and {@link + * #getDelegate()}'s {@link + * org.apache.beam.sdk.util.construction.ArtifactResolver#resolveArtifacts}.
+ */ + RunnerApi.Pipeline resolvePipelineProto() { + RunnerApi.Pipeline result = PipelineTranslation.toProto(getPipeline(), getSdkComponents()); + return getDelegate().resolveArtifacts(result); + } + + /** + * {@link PrismArtifactResolver} delegates to {@link + * org.apache.beam.sdk.util.construction.ArtifactResolver} to transform {@link + * RunnerApi.ArtifactInformation}. Defaults to {@link DefaultArtifactResolver#INSTANCE} if not + * set. + */ + abstract org.apache.beam.sdk.util.construction.ArtifactResolver getDelegate(); + + /** The {@link Pipeline} that {@link PrismArtifactResolver#resolvePipelineProto()} converts. */ + abstract Pipeline getPipeline(); + + /** + * SDK objects that will be represented by {@link + * org.apache.beam.model.pipeline.v1.RunnerApi.Components}. Instantiated via {@link + * SdkComponents#create(PipelineOptions)} by default, where {@link PipelineOptions} are acquired + * from {@link #getPipeline}'s {@link Pipeline#getOptions}. + */ + abstract SdkComponents getSdkComponents(); + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setDelegate( + org.apache.beam.sdk.util.construction.ArtifactResolver artifactResolver); + + abstract Optional getDelegate(); + + abstract Builder setSdkComponents(SdkComponents sdkComponents); + + abstract Optional getSdkComponents(); + + abstract Builder setPipeline(Pipeline pipeline); + + abstract Optional getPipeline(); + + abstract PrismArtifactResolver autoBuild(); + + final PrismArtifactResolver build() { + if (!getDelegate().isPresent()) { + setDelegate(DefaultArtifactResolver.INSTANCE); + } + + if (!getSdkComponents().isPresent()) { + checkState(getPipeline().isPresent()); + setSdkComponents(SdkComponents.create(getPipeline().get().getOptions())); + } + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java
b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java new file mode 100644 index 0000000000000..ef4646f023477 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactResolverTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; + +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.transforms.Impulse; +import org.apache.beam.sdk.util.construction.BeamUrns; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismArtifactResolver}. 
*/ +@RunWith(JUnit4.class) +public class PrismArtifactResolverTest { + @Test + public void resolvesPipeline() { + Pipeline pipeline = Pipeline.create(); + pipeline.apply(Impulse.create()); + PrismArtifactResolver underTest = PrismArtifactResolver.of(pipeline); + RunnerApi.Pipeline pipelineProto = underTest.resolvePipelineProto(); + RunnerApi.Components components = pipelineProto.getComponents(); + assertThat(components.getTransformsMap()).containsKey("Impulse"); + assertThat(components.getCodersMap()).containsKey("ByteArrayCoder"); + assertThat(components.getEnvironmentsMap()) + .containsKey(BeamUrns.getUrn(RunnerApi.StandardEnvironments.Environments.DOCKER)); + } +} From fb49e9644a4b81bdca339d98181c6f21256d474a Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Mon, 5 Aug 2024 18:00:08 -0400 Subject: [PATCH 12/78] Fix load test dataproc cluster name exceeded allowed length (#32062) --- .github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml | 2 +- .github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml | 2 +- .../workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml index 5ded71a7652a1..e2afb2e2cfd70 100644 --- a/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_CoGBK_Flink_Batch.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-cogbk-flink-batch-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-cogbk-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: 
https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml index 061a1b9e210ed..bae2f9f82ee1f 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Batch.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-pardo-flink-batch-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-pardo-flink-batch-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar diff --git a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml index bec926ab9656c..4485b7187f800 100644 --- a/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml +++ b/.github/workflows/beam_LoadTests_Python_ParDo_Flink_Streaming.yml @@ -48,7 +48,7 @@ env: INFLUXDB_USER: ${{ secrets.INFLUXDB_USER }} INFLUXDB_USER_PASSWORD: ${{ secrets.INFLUXDB_USER_PASSWORD }} GCLOUD_ZONE: us-central1-a - CLUSTER_NAME: beam-loadtests-python-pardo-flink-stream-${{ github.run_id }} + CLUSTER_NAME: beam-loadtests-py-pardo-flink-stream-${{ github.run_id }} GCS_BUCKET: gs://beam-flink-cluster FLINK_DOWNLOAD_URL: https://archive.apache.org/dist/flink/flink-1.15.0/flink-1.15.0-bin-scala_2.12.tgz HADOOP_DOWNLOAD_URL: https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar From 
c60623524ae9998cdfb8bfb1985f218e7dfa823a Mon Sep 17 00:00:00 2001 From: Jack McCluskey <34928439+jrmccluskey@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:51:42 -0400 Subject: [PATCH 13/78] Beam Website Updates for 2.58.0 Release (#31925) * Beam Website Updates for 2.58.0 Release * Update 2.57.0 links to archive links * Apply suggestions from code review Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> * move solace io to highlights * Add SpannerIO breaking change * fix urls * Update CHANGES.md Co-authored-by: tvalentyn * Apply suggestions from code review Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> * Update release date * add release date to changes.md --------- Co-authored-by: Rebecca Szper <98840847+rszper@users.noreply.github.com> Co-authored-by: tvalentyn --- CHANGES.md | 21 +-- website/www/site/config.toml | 2 +- .../www/site/content/en/blog/beam-2.58.0.md | 130 ++++++++++++++++++ .../site/content/en/get-started/downloads.md | 13 +- 4 files changed, 143 insertions(+), 23 deletions(-) create mode 100644 website/www/site/content/en/blog/beam-2.58.0.md diff --git a/CHANGES.md b/CHANGES.md index b127599ae0aa8..7f12b53342602 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -88,15 +88,10 @@ * ([#X](https://github.com/apache/beam/issues/X)). -# [2.58.0] - Unreleased +# [2.58.0] - 2024-08-06 ## Highlights -* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)). -* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)). - -## I/Os - * Support for [Solace](https://solace.com/) source (`SolaceIO.Read`) added (Java) ([#31440](https://github.com/apache/beam/issues/31440)). ## New Features / Improvements @@ -110,25 +105,13 @@ ## Breaking Changes -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). 
* [IcebergIO] IcebergCatalogConfig was changed to support specifying catalog properties in a key-store fashion ([#31726](https://github.com/apache/beam/pull/31726)) * [SpannerIO] Added validation that query and table cannot be specified at the same time for SpannerIO.read(). Previously withQuery overrides withTable, if set ([#24956](https://github.com/apache/beam/issues/24956)). -## Deprecations - -* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)). - ## Bugfixes * [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) -* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). - -## Security Fixes -* Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). - -## Known Issues - -* ([#X](https://github.com/apache/beam/issues/X)). +* Fixed a logging issue where Python worker dependency installation logs sometimes were not emitted in a timely manner ([#31977](https://github.com/apache/beam/pull/31977)) # [2.57.0] - 2024-06-26 diff --git a/website/www/site/config.toml b/website/www/site/config.toml index 7fe6df7a2c7a2..6675cf418bdd9 100644 --- a/website/www/site/config.toml +++ b/website/www/site/config.toml @@ -104,7 +104,7 @@ github_project_repo = "https://github.com/apache/beam" [params] description = "Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). 
Beam also brings DSL in different languages, allowing users to easily implement their data integration processes." -release_latest = "2.57.0" +release_latest = "2.58.0" # The repository and branch where the files live in Github or Colab. This is used # to serve and stage from your local branch, but publish to the master branch. # e.g. https://github.com/{{< param branch_repo >}}/path/to/notebook.ipynb diff --git a/website/www/site/content/en/blog/beam-2.58.0.md b/website/www/site/content/en/blog/beam-2.58.0.md new file mode 100644 index 0000000000000..603403cd7fdbc --- /dev/null +++ b/website/www/site/content/en/blog/beam-2.58.0.md @@ -0,0 +1,130 @@ +--- +title: "Apache Beam 2.58.0" +date: 2024-08-06 13:00:00 -0800 +categories: + - blog + - release +authors: + - jrmccluskey +--- + + +We are happy to present the new 2.58.0 release of Beam. +This release includes both improvements and new functionality. +See the [download page](/get-started/downloads/#2580-2024-08-06) for this release. + + + +For more information about changes in 2.58.0, check out the [detailed release notes](https://github.com/apache/beam/milestone/22). + +## I/Os + +* Support for [Solace](https://solace.com/) source (`SolaceIO.Read`) added (Java) ([#31440](https://github.com/apache/beam/issues/31440)). + +## New Features / Improvements + +* Multiple RunInference instances can now share the same model instance by setting the model_identifier parameter (Python) ([#31665](https://github.com/apache/beam/issues/31665)). 
+* Added options to control the number of Storage API multiplexing connections ([#31721](https://github.com/apache/beam/pull/31721)) +* [BigQueryIO] Better handling for batch Storage Write API when it hits AppendRows throughput quota ([#31837](https://github.com/apache/beam/pull/31837)) +* [IcebergIO] All specified catalog properties are passed through to the connector ([#31726](https://github.com/apache/beam/pull/31726)) +* Removed a third-party LGPL dependency from the Go SDK ([#31765](https://github.com/apache/beam/issues/31765)). +* Support for `MapState` and `SetState` when using Dataflow Runner v1 with Streaming Engine (Java) ([#18200](https://github.com/apache/beam/issues/18200)) + +## Breaking Changes + +* [IcebergIO] `IcebergCatalogConfig` was changed to support specifying catalog properties in a key-store fashion ([#31726](https://github.com/apache/beam/pull/31726)) +* [SpannerIO] Added validation that query and table cannot be specified at the same time for `SpannerIO.read()`. Previously `withQuery` overrides `withTable`, if set ([#24956](https://github.com/apache/beam/issues/24956)). + +## Bug fixes + +* [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) + +## List of Contributors + +According to git shortlog, the following people contributed to the 2.58.0 release. Thank you to all contributors! + +Ahmed Abualsaud + +Ahmet Altay + +Alexandre Moueddene + +Alexey Romanenko + +Andrew Crites + +Bartosz Zablocki + +Celeste Zeng + +Chamikara Jayalath + +Clay Johnson + +Damon Douglass + +Danny McCormick + +Dilnaz Amanzholova + +Florian Bernard + +Francis O'Hara + +George Ma + +Israel Herraiz + +Jack McCluskey + +Jaehyeon Kim + +James Roseman + +Kenneth Knowles + +Maciej Szwaja + +Michel Davit + +Minh Son Nguyen + +Naireen + +Niel Markwick + +Oliver Cardoza + +Robert Bradshaw + +Robert Burke + +Rohit Sinha + +S.
Veyrié + +Sam Whittle + +Shunping Huang + +Svetak Sundhar + +TongruiLi + +Tony Tang + +Valentyn Tymofieiev + +Vitaly Terentyev + +Yi Hu \ No newline at end of file diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index 8f3b92ef9f2a0..b7db1ddd65b6f 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -96,10 +96,17 @@ versions denoted `0.x.y`. ## Releases +### 2.58.0 (2024-08-06) +Official [source code download](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip). +[SHA-512](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.sha512). +[signature](https://downloads.apache.org/beam/2.58.0/apache-beam-2.58.0-source-release.zip.asc). + +[Release notes](https://github.com/apache/beam/releases/tag/v2.58.0) + ### 2.57.0 (2024-06-26) -Official [source code download](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip). -[SHA-512](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512). -[signature](https://downloads.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc). +Official [source code download](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip). +[SHA-512](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.sha512). +[signature](https://archive.apache.org/beam/2.57.0/apache-beam-2.57.0-source-release.zip.asc). 
[Release notes](https://github.com/apache/beam/releases/tag/v2.57.0) From d09c3237c8aa1ed48351046fe61bc0cc8794521a Mon Sep 17 00:00:00 2001 From: atask-g Date: Tue, 6 Aug 2024 11:53:36 -0400 Subject: [PATCH 14/78] Added support for the TOKENLIST type in Spanner (#32038) --- .../org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java | 3 +++ .../org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java index 3fd09c63da794..fa44cadeba0a4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchema.java @@ -184,6 +184,9 @@ private static Type parseSpannerType(String spannerType, Dialect dialect) { if (spannerType.startsWith("BYTES")) { return Type.bytes(); } + if ("TOKENLIST".equals(spannerType)) { + return Type.bytes(); + } if ("TIMESTAMP".equals(spannerType)) { return Type.timestamp(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java index 166df1704ca8a..1e89326d1e8c9 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/SpannerSchemaTest.java @@ -40,16 +40,18 @@ public void testSingleTable() throws Exception { .addColumn("test", "jsonVal", "JSON") .addColumn("test", "protoVal", "PROTO") .addColumn("test", "enumVal", "ENUM") + .addColumn("test", "tokens", "TOKENLIST") .build(); assertEquals(1, 
schema.getTables().size()); - assertEquals(6, schema.getColumns("test").size()); + assertEquals(7, schema.getColumns("test").size()); assertEquals(1, schema.getKeyParts("test").size()); assertEquals(Type.json(), schema.getColumns("test").get(3).getType()); assertEquals( Type.proto("customer.app.TestMessage"), schema.getColumns("test").get(4).getType()); assertEquals( Type.protoEnum("customer.app.TestEnum"), schema.getColumns("test").get(5).getType()); + assertEquals(Type.bytes(), schema.getColumns("test").get(6).getType()); } @Test From e9b5dc69532865e4ec20faa13a1ff88552bc50ae Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 6 Aug 2024 12:39:03 -0400 Subject: [PATCH 15/78] Enforce java.nio.charset.StandardCharsets against guava Charsets (#32083) * Enforce java.nio.charset.StandardCharsets against guava Charsets * Fix dependency analyzeClassesDependencies --- .../transforms/FormatTransform.java | 5 ++-- .../streaming/io/StreamingImpulseSource.java | 5 ++-- .../flink/FlinkJobServerDriverTest.java | 12 ++++++---- ...FlinkPipelineExecutionEnvironmentTest.java | 4 ++-- .../runners/flink/FlinkSubmissionTest.java | 4 ++-- .../ExecutableStageDoFnOperatorTest.java | 9 ++++--- .../wrappers/streaming/FlinkKeyUtilsTest.java | 4 ++-- .../dataflow/DataflowPipelineTranslator.java | 7 +++--- .../runners/dataflow/worker/graph/Nodes.java | 6 ++--- .../GroupingShuffleEntryIteratorTest.java | 7 +++--- .../state/WindmillStateInternalsTest.java | 11 +++++---- .../state/WindmillStateReaderTest.java | 8 +++---- .../artifact/ArtifactStagingService.java | 4 ++-- .../state/StateRequestHandlers.java | 5 ++-- .../ArtifactRetrievalServiceTest.java | 4 ++-- .../testing/TestUniversalRunner.java | 5 ++-- .../resources/beam/checkstyle/checkstyle.xml | 8 +++++++ .../providers/LoggingTransformProvider.java | 4 ++-- .../transforms/errorhandling/BadRecord.java | 6 ++--- .../ByteBuddyOnTimerInvokerFactory.java | 6 ++--- .../resourcehints/ResourceHints.java | 8 +++---- 
.../sdk/coders/StructuralByteArrayTest.java | 10 ++++---- .../apache/beam/sdk/io/FileBasedSinkTest.java | 2 +- .../org/apache/beam/sdk/io/FileIOTest.java | 6 ++--- .../apache/beam/sdk/io/TFRecordIOTest.java | 7 +++--- .../apache/beam/sdk/io/TextIOReadTest.java | 8 +++---- .../apache/beam/sdk/io/TextIOWriteTest.java | 6 ++--- .../sdk/io/TextRowCountEstimatorTest.java | 10 ++++---- .../apache/beam/sdk/io/WriteFilesTest.java | 5 ++-- .../options/PipelineOptionsFactoryTest.java | 24 +++++++++---------- .../sdk/schemas/SchemaTranslationTest.java | 5 ++-- ...fferedElementCountingOutputStreamTest.java | 6 ++--- .../util/ExposedByteArrayInputStreamTest.java | 6 ++--- .../ExposedByteArrayOutputStreamTest.java | 4 ++-- .../beam/sdk/util/SerializableUtilsTest.java | 4 ++-- .../service/ExpansionServiceTest.java | 4 ++-- .../avro/AvroGenericCoderTranslator.java | 6 ++--- .../sdk/extensions/avro/io/AvroIOTest.java | 6 ++--- .../python/PythonExternalTransform.java | 5 ++-- .../sdk/extensions/python/PythonService.java | 5 ++-- sdks/java/extensions/sql/jdbc/build.gradle | 2 +- .../sdk/extensions/sql/jdbc/BeamSqlLine.java | 6 ++--- .../provider/text/TextTableProviderTest.java | 20 +++++++++------- .../sdk/io/clickhouse/ClickHouseWriter.java | 4 ++-- .../ContextualTextIOTest.java | 16 ++++++------- ...PubsubReadSchemaTransformProviderTest.java | 6 ++--- .../sdk/io/kafka/ReadFromKafkaDoFnTest.java | 6 ++--- .../apache/beam/sdk/tpcds/QueryReader.java | 4 ++-- .../beam/sdk/tpcds/SqlTransformRunner.java | 4 ++-- .../beam/sdk/tpcds/TableSchemaJSONLoader.java | 4 ++-- .../TransformServiceLauncherTest.java | 10 ++++---- 51 files changed, 182 insertions(+), 161 deletions(-) diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java b/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java index 2d9089fcd29af..296d7e7d2409b 100644 --- 
a/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/kafkatopubsub/transforms/FormatTransform.java @@ -17,6 +17,7 @@ */ package org.apache.beam.examples.complete.kafkatopubsub.transforms; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import org.apache.beam.examples.complete.kafkatopubsub.avro.AvroDataClass; @@ -37,7 +38,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PDone; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.kafka.common.serialization.StringDeserializer; @@ -120,7 +120,8 @@ public PDone expand(PCollection input) { MapElements.into(TypeDescriptor.of(PubsubMessage.class)) .via( (String json) -> - new PubsubMessage(json.getBytes(Charsets.UTF_8), ImmutableMap.of()))) + new PubsubMessage( + json.getBytes(StandardCharsets.UTF_8), ImmutableMap.of()))) .apply( "writePubsubMessagesToPubSub", PubsubIO.writeMessages().to(options.getOutputTopic())); } diff --git a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java index 8f21e42d61e66..871d7a5a39895 100644 --- a/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java +++ b/runners/flink/src/main/java/org/apache/beam/runners/flink/translation/wrappers/streaming/io/StreamingImpulseSource.java @@ -17,8 +17,8 @@ */ package org.apache.beam.runners.flink.translation.wrappers.streaming.io; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.util.WindowedValue; 
-import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,7 +60,8 @@ public void run(SourceContext> ctx) { while (running && (messageCount == 0 || count < subtaskCount)) { synchronized (ctx.getCheckpointLock()) { ctx.collect( - WindowedValue.valueInGlobalWindow(String.valueOf(count).getBytes(Charsets.UTF_8))); + WindowedValue.valueInGlobalWindow( + String.valueOf(count).getBytes(StandardCharsets.UTF_8))); count++; } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java index 4a628eeb4fdf0..22516cbc96331 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkJobServerDriverTest.java @@ -25,7 +25,7 @@ import java.io.ByteArrayOutputStream; import java.io.PrintStream; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; /** Tests for {@link FlinkJobServerDriver}. 
*/ @@ -104,7 +104,7 @@ public void testJobServerDriver() throws Exception { boolean success = false; while (!success) { newErr.flush(); - String output = baos.toString(Charsets.UTF_8.name()); + String output = baos.toString(StandardCharsets.UTF_8.name()); if (output.contains("JobService started on localhost:") && output.contains("ArtifactStagingService started on localhost:") && output.contains("ExpansionService started on localhost:")) { @@ -114,7 +114,8 @@ public void testJobServerDriver() throws Exception { } } assertThat(driver.getJobServerUrl(), is(not(nullValue()))); - assertThat(baos.toString(Charsets.UTF_8.name()), containsString(driver.getJobServerUrl())); + assertThat( + baos.toString(StandardCharsets.UTF_8.name()), containsString(driver.getJobServerUrl())); assertThat(driverThread.isAlive(), is(true)); } catch (Throwable t) { // restore to print exception @@ -149,7 +150,7 @@ public void testJobServerDriverWithoutExpansionService() throws Exception { boolean success = false; while (!success) { newErr.flush(); - String output = baos.toString(Charsets.UTF_8.name()); + String output = baos.toString(StandardCharsets.UTF_8.name()); if (output.contains("JobService started on localhost:") && output.contains("ArtifactStagingService started on localhost:")) { success = true; @@ -161,7 +162,8 @@ public void testJobServerDriverWithoutExpansionService() throws Exception { } } assertThat(driver.getJobServerUrl(), is(not(nullValue()))); - assertThat(baos.toString(Charsets.UTF_8.name()), containsString(driver.getJobServerUrl())); + assertThat( + baos.toString(StandardCharsets.UTF_8.name()), containsString(driver.getJobServerUrl())); assertThat(driverThread.isAlive(), is(true)); } catch (Throwable t) { // restore to print exception diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java index 
9d898ed53a896..3b92c282c38a7 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkPipelineExecutionEnvironmentTest.java @@ -38,6 +38,7 @@ import java.lang.reflect.Method; import java.net.MalformedURLException; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -56,7 +57,6 @@ import org.apache.beam.sdk.util.construction.PTransformMatchers; import org.apache.beam.sdk.util.construction.PTransformTranslation; import org.apache.beam.sdk.util.construction.resources.PipelineResources; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.RemoteEnvironment; @@ -373,7 +373,7 @@ public void processElement(ProcessContext ctx) { } replacementStdErr.flush(); assertThat( - new String(byteArrayOutputStream.toByteArray(), Charsets.UTF_8), + new String(byteArrayOutputStream.toByteArray(), StandardCharsets.UTF_8), containsString( "UnboundedSources present which rely on checkpointing, but checkpointing is disabled.")); } diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java index 22a9ce4f39ab6..cf860717def37 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/FlinkSubmissionTest.java @@ -20,6 +20,7 @@ import java.io.File; import java.lang.reflect.Field; import java.lang.reflect.Modifier; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.security.Permission; import java.util.Collection; @@ -30,7 
+31,6 @@ import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.util.construction.resources.PipelineResources; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -200,7 +200,7 @@ private static void prepareEnvironment() throws Exception { RestOptions.PORT.key(), flinkCluster.getRestPort()); - Files.write(file.toPath(), config.getBytes(Charsets.UTF_8)); + Files.write(file.toPath(), config.getBytes(StandardCharsets.UTF_8)); // Create a new environment with the location of the Flink config for CliFrontend ImmutableMap newEnv = diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java index cf5b2b555124b..2eb0545b77940 100644 --- a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/ExecutableStageDoFnOperatorTest.java @@ -103,7 +103,6 @@ import org.apache.beam.sdk.values.WindowingStrategy; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.Struct; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -818,7 +817,7 @@ private void testEnsureDeferredStateCleanupTimerFiring(boolean withCheckpointing BagState state = // State from the SDK Harness is stored as ByteStrings operator.keyedStateInternals.state( stateNamespace, StateTags.bag(stateId, ByteStringCoder.of())); - state.add(ByteString.copyFrom("userstate".getBytes(Charsets.UTF_8))); + state.add(ByteString.copyFrom("userstate".getBytes(StandardCharsets.UTF_8))); assertThat(testHarness.numKeyedStateEntries(), is(1)); // user timer that fires after the end of the window and after state cleanup @@ -966,7 +965,7 @@ public void testEnsureStateCleanupOnFinalWatermark() throws Exception { BagState state = // State from the SDK Harness is stored as ByteStrings operator.keyedStateInternals.state( stateNamespace, StateTags.bag(stateId, ByteStringCoder.of())); - state.add(ByteString.copyFrom("userstate".getBytes(Charsets.UTF_8))); + state.add(ByteString.copyFrom("userstate".getBytes(StandardCharsets.UTF_8))); // No timers have been set for cleanup assertThat(testHarness.numEventTimeTimers(), is(0)); // State has been created @@ -988,8 +987,8 @@ public void testCacheTokenHandling() throws Exception { new ExecutableStageDoFnOperator.BagUserStateFactory<>( test, stateBackend, NoopLock.get(), null); - ByteString key1 = ByteString.copyFrom("key1", Charsets.UTF_8); - ByteString key2 = ByteString.copyFrom("key2", Charsets.UTF_8); + ByteString key1 = ByteString.copyFrom("key1", StandardCharsets.UTF_8); + ByteString key2 = ByteString.copyFrom("key2", StandardCharsets.UTF_8); Map> userStateMapMock = Mockito.mock(Map.class); diff --git a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java index cab45632ac552..cdf461b5fde83 100644 --- 
a/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java +++ b/runners/flink/src/test/java/org/apache/beam/runners/flink/translation/wrappers/streaming/FlinkKeyUtilsTest.java @@ -22,12 +22,12 @@ import static org.hamcrest.core.Is.is; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.coders.VoidCoder; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Test; /** Tests for {@link FlinkKeyUtils}. */ @@ -66,7 +66,7 @@ public void testCoderContext() throws Exception { @Test @SuppressWarnings("ByteBufferBackingArray") public void testFromEncodedKey() { - ByteString input = ByteString.copyFrom("hello world".getBytes(Charsets.UTF_8)); + ByteString input = ByteString.copyFrom("hello world".getBytes(StandardCharsets.UTF_8)); ByteBuffer encodedKey = FlinkKeyUtils.fromEncodedKey(input); assertThat(encodedKey.array(), is(input.toByteArray())); } diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java index f905e136e83ba..1fedcd8f3a290 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowPipelineTranslator.java @@ -42,6 +42,7 @@ import com.google.api.services.dataflow.model.Job; import com.google.api.services.dataflow.model.Step; import com.google.api.services.dataflow.model.WorkerPool; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import 
java.util.ArrayList; import java.util.Collections; @@ -110,7 +111,6 @@ import org.apache.beam.sdk.values.WindowingStrategy; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.net.PercentCodec; @@ -618,7 +618,7 @@ static class StepTranslator implements StepTranslationContext { // For compatibility with URL encoding implementations that represent space as +, // always encode + as %2b even though we don't encode space as +. private final PercentCodec percentCodec = - new PercentCodec("+".getBytes(Charsets.US_ASCII), false); + new PercentCodec("+".getBytes(StandardCharsets.US_ASCII), false); private StepTranslator(Translator translator, Step step) { this.translator = translator; @@ -764,7 +764,8 @@ private void addResourceHints(ResourceHints hints) { try { urlEncodedHints.put( entry.getKey(), - new String(percentCodec.encode(entry.getValue().toBytes()), Charsets.US_ASCII)); + new String( + percentCodec.encode(entry.getValue().toBytes()), StandardCharsets.US_ASCII)); } catch (EncoderException e) { // Should never happen. 
throw new RuntimeException("Invalid value for resource hint: " + entry.getKey(), e); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java index 6092d0d64de5a..d824324170005 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/graph/Nodes.java @@ -29,11 +29,11 @@ import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; import org.apache.beam.runners.dataflow.worker.util.common.worker.Operation; import org.apache.beam.runners.dataflow.worker.util.common.worker.OutputReceiver; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.extensions.gcp.util.Transport; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; /** Container class for different types of network nodes. All nodes only have reference equality. 
*/ @@ -59,7 +59,7 @@ private static String toStringWithTrimmedLiterals(GenericJson json) { ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); final JsonGenerator baseGenerator = MoreObjects.firstNonNull(json.getFactory(), Transport.getJsonFactory()) - .createJsonGenerator(byteStream, Charsets.UTF_8); + .createJsonGenerator(byteStream, StandardCharsets.UTF_8); JsonGenerator generator = new JsonGenerator() { @Override @@ -164,7 +164,7 @@ public void enablePrettyPrint() throws IOException { generator.enablePrettyPrint(); generator.serialize(json); generator.flush(); - return byteStream.toString(Charsets.UTF_8.name()); + return byteStream.toString(StandardCharsets.UTF_8.name()); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java index 2421d7faf8240..8c6a003cb72bc 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/util/common/worker/GroupingShuffleEntryIteratorTest.java @@ -42,7 +42,6 @@ import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.util.common.Reiterator; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.checkerframework.checker.nullness.qual.Nullable; import org.junit.After; @@ -130,10 +129,10 @@ private void setCurrentExecutionState(String mockOriginalName) { private static 
ShuffleEntry shuffleEntry(String key, String value) { return new ShuffleEntry( /* use key itself as position */ - ByteArrayShufflePosition.of(key.getBytes(Charsets.UTF_8)), - ByteString.copyFrom(key.getBytes(Charsets.UTF_8)), + ByteArrayShufflePosition.of(key.getBytes(StandardCharsets.UTF_8)), + ByteString.copyFrom(key.getBytes(StandardCharsets.UTF_8)), ByteString.copyFrom(new byte[0]), - ByteString.copyFrom(value.getBytes(Charsets.UTF_8))); + ByteString.copyFrom(value.getBytes(StandardCharsets.UTF_8))); } @Test diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java index 33e47623cd0ee..d06ed0f526c79 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateInternalsTest.java @@ -80,7 +80,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Supplier; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -225,7 +224,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -240,7 
+239,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyNewKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyNewKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -255,7 +254,7 @@ public void resetUnderTest() { .forComputation("comp") .forKey( WindmillComputationKey.create( - "comp", ByteString.copyFrom("dummyNewKey", Charsets.UTF_8), 123), + "comp", ByteString.copyFrom("dummyNewKey", StandardCharsets.UTF_8), 123), 17L, workToken) .forFamily(STATE_FAMILY), @@ -2004,7 +2003,9 @@ false, key(NAMESPACE, tag), STATE_FAMILY, VarIntCoder.of())) } // clear cache and recreate multimapState - cache.forComputation("comp").invalidate(ByteString.copyFrom("dummyKey", Charsets.UTF_8), 123); + cache + .forComputation("comp") + .invalidate(ByteString.copyFrom("dummyKey", StandardCharsets.UTF_8), 123); resetUnderTest(); multimapState = underTest.state(NAMESPACE, addr); diff --git a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java index 8dbfc35192b7d..b06d88bf4bc4e 100644 --- a/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java +++ b/runners/google-cloud-dataflow-java/worker/src/test/java/org/apache/beam/runners/dataflow/worker/windmill/state/WindmillStateReaderTest.java @@ -27,6 +27,7 @@ import com.google.api.client.util.Lists; import com.google.common.collect.Maps; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Arrays; @@ -48,7 +49,6 @@ import org.apache.beam.sdk.util.ByteStringOutputStream; import 
org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Range; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.BaseEncoding; @@ -1151,8 +1151,8 @@ public void testReadSortedListWithContinuations() throws Exception { .addFetchRanges(SortedListRange.newBuilder().setStart(beginning).setLimit(end)) .setFetchMaxBytes(WindmillStateReader.MAX_ORDERED_LIST_BYTES)); - final ByteString CONT_1 = ByteString.copyFrom("CONTINUATION_1", Charsets.UTF_8); - final ByteString CONT_2 = ByteString.copyFrom("CONTINUATION_2", Charsets.UTF_8); + final ByteString CONT_1 = ByteString.copyFrom("CONTINUATION_1", StandardCharsets.UTF_8); + final ByteString CONT_2 = ByteString.copyFrom("CONTINUATION_2", StandardCharsets.UTF_8); Windmill.KeyedGetDataResponse.Builder response1 = Windmill.KeyedGetDataResponse.newBuilder() .setKey(DATA_KEY) @@ -1327,7 +1327,7 @@ public void testReadTagValuePrefixWithContinuations() throws Exception { .setStateFamily(STATE_FAMILY) .setFetchMaxBytes(WindmillStateReader.MAX_TAG_VALUE_PREFIX_BYTES)); - final ByteString CONT = ByteString.copyFrom("CONTINUATION", Charsets.UTF_8); + final ByteString CONT = ByteString.copyFrom("CONTINUATION", StandardCharsets.UTF_8); Windmill.KeyedGetDataResponse.Builder response1 = Windmill.KeyedGetDataResponse.newBuilder() .setKey(DATA_KEY) diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java index a8e5e2ab6a882..8c7a356b99392 100644 --- 
a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/artifact/ArtifactStagingService.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.OutputStream; import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.HashMap; @@ -56,7 +57,6 @@ import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.Status; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.StatusException; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -192,7 +192,7 @@ public void removeStagedArtifacts(String stagingToken) throws IOException { private ResourceId stagingDir(String stagingToken) { return FileSystems.matchNewResource(root, true) .resolve( - Hashing.sha256().hashString(stagingToken, Charsets.UTF_8).toString(), + Hashing.sha256().hashString(stagingToken, StandardCharsets.UTF_8).toString(), ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY); } }; diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java index e2c45850dba93..4e1c31744c1a0 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/state/StateRequestHandlers.java @@ -19,6 +19,7 @@ import static 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.EnumMap; @@ -54,7 +55,6 @@ import org.apache.beam.sdk.util.common.Reiterable; import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; /** @@ -655,7 +655,8 @@ private BagUserStateHandler createHandl } private static BeamFnApi.ProcessBundleRequest.CacheToken createCacheToken() { - ByteString token = ByteString.copyFrom(UUID.randomUUID().toString().getBytes(Charsets.UTF_8)); + ByteString token = + ByteString.copyFrom(UUID.randomUUID().toString().getBytes(StandardCharsets.UTF_8)); return BeamFnApi.ProcessBundleRequest.CacheToken.newBuilder() .setUserState(BeamFnApi.ProcessBundleRequest.CacheToken.UserState.getDefaultInstance()) .setToken(token) diff --git a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java index d6b48a936135e..4d19e87c3d11c 100644 --- a/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java +++ b/runners/java-fn-execution/src/test/java/org/apache/beam/runners/fnexecution/artifact/ArtifactRetrievalServiceTest.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -33,7 +34,6 @@ import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; import 
org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.Before; @@ -74,7 +74,7 @@ private void stageFiles(Map files) throws IOException { for (Map.Entry entry : files.entrySet()) { Files.write( Paths.get(stagingDir.toString(), entry.getKey()), - entry.getValue().getBytes(Charsets.UTF_8)); + entry.getValue().getBytes(StandardCharsets.UTF_8)); } } diff --git a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java index 533106869c62b..a36c1e8b2efbd 100644 --- a/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java +++ b/runners/portability/java/src/main/java/org/apache/beam/runners/portability/testing/TestUniversalRunner.java @@ -21,6 +21,7 @@ import com.google.auto.service.AutoService; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import org.apache.beam.runners.portability.PortableRunner; @@ -33,7 +34,6 @@ import org.apache.beam.sdk.options.PortablePipelineOptions; import org.apache.beam.sdk.runners.PipelineRunnerRegistrar; import org.apache.beam.sdk.testing.TestPipelineOptions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.hamcrest.Matchers; @@ -65,7 +65,8 @@ public PipelineResult run(Pipeline pipeline) { testOptions.setJobEndpoint( "localhost:" + new String( - 
Files.readAllBytes(Paths.get(localServicePortFilePath)), Charsets.UTF_8) + Files.readAllBytes(Paths.get(localServicePortFilePath)), + StandardCharsets.UTF_8) .trim()); } catch (IOException e) { throw new RuntimeException( diff --git a/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml b/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml index 3c4cfdfbc6f58..5cee5d2f33e2a 100644 --- a/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml +++ b/sdks/java/build-tools/src/main/resources/beam/checkstyle/checkstyle.xml @@ -119,6 +119,14 @@ page at http://checkstyle.sourceforge.net/config.html --> + + + + + + + + diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java index 25efaeae2a0ef..2908171f5c02c 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/transforms/providers/LoggingTransformProvider.java @@ -19,6 +19,7 @@ import com.google.auto.service.AutoService; import com.google.auto.value.AutoValue; +import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; import java.util.Map; @@ -36,7 +37,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.slf4j.Logger; @@ -166,7 +166,7 @@ private static DoFn createDoFn(Level logLevel, String prefix, Schema r return new DoFn() { @ProcessElement public void 
processElement(@Element Row row, OutputReceiver out) { - String msg = prefix + new String(fn.apply(row), Charsets.UTF_8); + String msg = prefix + new String(fn.apply(row), StandardCharsets.UTF_8); // Looks like this is the best we can do. // https://stackoverflow.com/questions/2621701/setting-log-level-of-message-at-runtime-in-slf4j switch (logLevel) { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java index fd49078350c48..558f912a6b1ff 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/errorhandling/BadRecord.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -34,7 +35,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.checkerframework.checker.nullness.qual.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -202,11 +202,11 @@ public abstract static class Builder { public Builder addExceptionStackTrace(Exception exception) throws IOException { ByteArrayOutputStream stream = new ByteArrayOutputStream(); - PrintStream printStream = new PrintStream(stream, false, Charsets.UTF_8.name()); + PrintStream printStream = new PrintStream(stream, false, StandardCharsets.UTF_8.name()); exception.printStackTrace(printStream); printStream.close(); - this.setExceptionStacktrace(new String(stream.toByteArray(), Charsets.UTF_8)); + this.setExceptionStacktrace(new String(stream.toByteArray(), 
StandardCharsets.UTF_8)); return this; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java index 7b9ac7e15c2e8..e318e82513ca4 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/reflect/ByteBuddyOnTimerInvokerFactory.java @@ -22,6 +22,7 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ExecutionException; import net.bytebuddy.ByteBuddy; import net.bytebuddy.description.modifier.FieldManifestation; @@ -43,7 +44,6 @@ import org.apache.beam.sdk.transforms.DoFn.TimerId; import org.apache.beam.sdk.transforms.reflect.ByteBuddyDoFnInvokerFactory.DoFnMethodWithExtraParametersDelegation; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.CharMatcher; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheBuilder; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.CacheLoader; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.cache.LoadingCache; @@ -191,7 +191,7 @@ public Constructor load(final OnTimerMethodSpecifier onTimerMethodSpecifier) "%s$%s$%s", OnTimerInvoker.class.getSimpleName(), CharMatcher.javaLetterOrDigit().retainFrom(timerId), - BaseEncoding.base64().omitPadding().encode(timerId.getBytes(Charsets.UTF_8))); + BaseEncoding.base64().omitPadding().encode(timerId.getBytes(StandardCharsets.UTF_8))); DynamicType.Builder builder = new ByteBuddy() @@ -241,7 +241,7 @@ public Constructor load(final OnTimerMethodSpecifier onTimerMethodSpecifier) "%s$%s$%s", OnTimerInvoker.class.getSimpleName(), 
CharMatcher.javaLetterOrDigit().retainFrom(timerId), - BaseEncoding.base64().omitPadding().encode(timerId.getBytes(Charsets.UTF_8))); + BaseEncoding.base64().omitPadding().encode(timerId.getBytes(StandardCharsets.UTF_8))); DynamicType.Builder builder = new ByteBuddy() diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java index 2f034626acd77..527a699568f40 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/resourcehints/ResourceHints.java @@ -19,6 +19,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.function.Function; @@ -28,7 +29,6 @@ import org.apache.beam.model.pipeline.v1.RunnerApi.StandardResourceHints; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ProtocolMessageEnum; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; @@ -179,7 +179,7 @@ public ResourceHint mergeWithOuter(ResourceHint outer) { @Override public byte[] toBytes() { - return String.valueOf(value).getBytes(Charsets.US_ASCII); + return String.valueOf(value).getBytes(StandardCharsets.US_ASCII); } } @@ -196,7 +196,7 @@ public static String parse(String s) { @Override public byte[] toBytes() { - return value.getBytes(Charsets.US_ASCII); + return value.getBytes(StandardCharsets.US_ASCII); } @Override @@ -254,7 +254,7 @@ public 
ResourceHint mergeWithOuter(ResourceHint outer) { @Override public byte[] toBytes() { - return String.valueOf(value).getBytes(Charsets.US_ASCII); + return String.valueOf(value).getBytes(StandardCharsets.US_ASCII); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java index bd8fdd84fb096..cb0845796fe9c 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/coders/StructuralByteArrayTest.java @@ -20,7 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -32,10 +32,10 @@ public final class StructuralByteArrayTest { @Test public void testStructuralByteArray() throws Exception { assertEquals( - new StructuralByteArray("test string".getBytes(Charsets.UTF_8)), - new StructuralByteArray("test string".getBytes(Charsets.UTF_8))); + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8)), + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8))); assertFalse( - new StructuralByteArray("test string".getBytes(Charsets.UTF_8)) - .equals(new StructuralByteArray("diff string".getBytes(Charsets.UTF_8)))); + new StructuralByteArray("test string".getBytes(StandardCharsets.UTF_8)) + .equals(new StructuralByteArray("diff string".getBytes(StandardCharsets.UTF_8)))); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java index 7fd54039b1dda..c4f83954e66cc 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java +++ 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileBasedSinkTest.java @@ -17,8 +17,8 @@ */ package org.apache.beam.sdk.io; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.beam.sdk.io.WriteFiles.UNKNOWN_SHARDNUM; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets.UTF_8; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.is; diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java index b87c9caa12441..90b0822d9dcaa 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/FileIOTest.java @@ -31,6 +31,7 @@ import java.io.OutputStreamWriter; import java.io.Serializable; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.CopyOption; import java.nio.file.Files; import java.nio.file.Path; @@ -69,7 +70,6 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.joda.time.Duration; import org.junit.Rule; import org.junit.Test; @@ -368,10 +368,10 @@ public void testMatchWatchForNewFiles() throws IOException, InterruptedException public void testRead() throws IOException { final String path = tmpFolder.newFile("file").getAbsolutePath(); final String pathGZ = tmpFolder.newFile("file.gz").getAbsolutePath(); - Files.write(new File(path).toPath(), "Hello world".getBytes(Charsets.UTF_8)); + Files.write(new File(path).toPath(), "Hello world".getBytes(StandardCharsets.UTF_8)); try (Writer writer = new OutputStreamWriter( - new GZIPOutputStream(new FileOutputStream(pathGZ)), Charsets.UTF_8)) { + new GZIPOutputStream(new 
FileOutputStream(pathGZ)), StandardCharsets.UTF_8)) { writer.write("Hello world"); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java index acde8c91431da..a38faf077e073 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TFRecordIOTest.java @@ -67,7 +67,6 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.display.DisplayData; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.BaseEncoding; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; @@ -212,7 +211,7 @@ public void testWriteTwo() throws Exception { @Category(NeedsRunner.class) public void testReadInvalidRecord() throws Exception { expectedException.expectMessage("Not a valid TFRecord. 
Fewer than 12 bytes."); - runTestRead("bar".getBytes(Charsets.UTF_8), new String[0]); + runTestRead("bar".getBytes(StandardCharsets.UTF_8), new String[0]); } @Test @@ -445,14 +444,14 @@ private static Iterable makeLines(int n, int minRecordSize) { static class ByteArrayToString extends DoFn { @ProcessElement public void processElement(ProcessContext c) { - c.output(new String(c.element(), Charsets.UTF_8)); + c.output(new String(c.element(), StandardCharsets.UTF_8)); } } static class StringToByteArray extends DoFn { @ProcessElement public void processElement(ProcessContext c) { - c.output(c.element().getBytes(Charsets.UTF_8)); + c.output(c.element().getBytes(StandardCharsets.UTF_8)); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java index 253308d1b93f0..8d9adbefd02bf 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOReadTest.java @@ -51,6 +51,7 @@ import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -91,7 +92,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -642,7 +642,7 @@ private void runTestRead(String[] expected) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] 
encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -865,7 +865,7 @@ public void testProgressEmptyFile() throws IOException { public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedSource.BoundedReader reader = - prepareSource(file.getBytes(Charsets.UTF_8)) + prepareSource(file.getBytes(StandardCharsets.UTF_8)) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); @@ -901,7 +901,7 @@ public void testProgressTextFile() throws IOException { @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; - BoundedSource source = prepareSource(file.getBytes(Charsets.UTF_8)); + BoundedSource source = prepareSource(file.getBytes(StandardCharsets.UTF_8)); BoundedSource remainder; // Create the remainder, verifying properties pre- and post-splitting. 
diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java index 312605f3fcc5e..695ff4474d715 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextIOWriteTest.java @@ -39,6 +39,7 @@ import java.io.OutputStream; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -73,7 +74,6 @@ import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Function; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Functions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Predicate; @@ -492,7 +492,7 @@ private static void assertOutputFiles( List expectedElements = new ArrayList<>(elems.length); for (String elem : elems) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); expectedElements.add(line); } @@ -509,7 +509,7 @@ private static void assertOutputFiles( private static List readLinesFromFile(File f) throws IOException { List currentFile = new ArrayList<>(); - try (BufferedReader reader = Files.newBufferedReader(f.toPath(), Charsets.UTF_8)) { + try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) { while (true) { String line = reader.readLine(); if (line == null) { diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java index 17ca3ba85fd81..e52d4112e11e2 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/TextRowCountEstimatorTest.java @@ -20,8 +20,8 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.Writer; +import java.nio.charset.StandardCharsets; import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Files; import org.junit.Assert; import org.junit.Rule; @@ -38,7 +38,7 @@ public class TextRowCountEstimatorTest { @Test public void testNonEmptyFiles() throws Exception { File file1 = temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 100; i++) { writer.write("123123123\n"); } @@ -47,7 +47,7 @@ public void testNonEmptyFiles() throws Exception { temporaryFolder.newFolder("testfolder"); temporaryFolder.newFolder("testfolder2"); file1 = temporaryFolder.newFile("testfolder/test2.txt"); - writer = Files.newWriter(file1, Charsets.UTF_8); + writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 50; i++) { writer.write("123123123\n"); } @@ -71,7 +71,7 @@ public void testEmptyFolder() throws Exception { @Test public void testEmptyFile() throws Exception { File file1 = temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 100; i++) { writer.write("\n"); } @@ -86,7 +86,7 @@ public void testEmptyFile() throws Exception { @Test(expected = TextRowCountEstimator.NoEstimationException.class) public void lotsOfNewLines() throws Exception { File file1 = 
temporaryFolder.newFile("file1.txt"); - Writer writer = Files.newWriter(file1, Charsets.UTF_8); + Writer writer = Files.newWriter(file1, StandardCharsets.UTF_8); for (int i = 0; i < 1000; i++) { writer.write("\n"); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java index 0ab8efac7eb1a..cc174002bb464 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/io/WriteFilesTest.java @@ -35,6 +35,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.text.DecimalFormat; import java.util.ArrayList; @@ -93,7 +94,6 @@ import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.PDone; import org.apache.beam.sdk.values.ShardedKey; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; @@ -1035,7 +1035,8 @@ static void checkFileContents( List actual = Lists.newArrayList(); for (File outputFile : outputFiles) { List actualShard = Lists.newArrayList(); - try (BufferedReader reader = Files.newBufferedReader(outputFile.toPath(), Charsets.UTF_8)) { + try (BufferedReader reader = + Files.newBufferedReader(outputFile.toPath(), StandardCharsets.UTF_8)) { for (; ; ) { String line = reader.readLine(); if (line == null) { diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java index 2643fb556ff47..291bb52978808 100644 --- 
a/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/options/PipelineOptionsFactoryTest.java @@ -59,6 +59,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.List; import java.util.Map; @@ -74,7 +75,6 @@ import org.apache.beam.sdk.testing.InterceptingUrlClassLoader; import org.apache.beam.sdk.testing.RestoreSystemProperties; import org.apache.beam.sdk.util.common.ReflectHelpers; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Collections2; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -1727,7 +1727,7 @@ public void testWhenNoHelpIsRequested() { assertFalse( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertEquals("", output); } @@ -1739,7 +1739,7 @@ public void testDefaultHelpAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("Use --help= for detailed help.")); @@ -1753,7 +1753,7 @@ public void testSpecificHelpAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( 
arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1769,7 +1769,7 @@ public void testSpecificHelpAsArgumentWithSimpleClassName() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1785,7 +1785,7 @@ public void testSpecificHelpAsArgumentWithClassNameSuffix() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); @@ -1815,7 +1815,7 @@ public void testShortnameSpecificHelpHasMultipleMatches() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("Multiple matches found for NameConflict")); assertThat( output, @@ -1839,7 +1839,7 @@ public void 
testHelpWithOptionThatOutputsValidEnumTypes() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("")); } @@ -1851,7 +1851,7 @@ public void testHelpWithBadOptionNameAsArgument() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("Unable to find option org.apache.beam.sdk.Pipeline")); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); @@ -1865,7 +1865,7 @@ public void testHelpWithHiddenMethodAndInterface() { assertTrue( PipelineOptionsFactory.printHelpUsageAndExitIfNeeded( arguments, new PrintStream(baos), false /* exit */)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); // A hidden interface. 
assertThat( output, not(containsString("org.apache.beam.sdk.options.DataflowPipelineDebugOptions"))); @@ -1877,7 +1877,7 @@ public void testHelpWithHiddenMethodAndInterface() { public void testProgrammaticPrintHelp() { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PipelineOptionsFactory.printHelp(new PrintStream(baos)); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("The set of registered options are:")); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); } @@ -1886,7 +1886,7 @@ public void testProgrammaticPrintHelp() { public void testProgrammaticPrintHelpForSpecificType() { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PipelineOptionsFactory.printHelp(new PrintStream(baos), PipelineOptions.class); - String output = new String(baos.toByteArray(), Charsets.UTF_8); + String output = new String(baos.toByteArray(), StandardCharsets.UTF_8); assertThat(output, containsString("org.apache.beam.sdk.options.PipelineOptions")); assertThat(output, containsString("--runner")); assertThat(output, containsString("Default: " + DEFAULT_RUNNER_NAME)); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java index bd7a0da394ae5..3b22addbf5455 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/SchemaTranslationTest.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertThrows; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.HashMap; @@ -54,7 +55,6 @@ import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.values.Row; import 
org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.joda.time.Instant; import org.junit.Test; @@ -243,7 +243,8 @@ public static Iterable data() { .setUrn("pythonsdk:value") .setPayload( ByteString.copyFrom( - "some payload describing a python type", Charsets.UTF_8)) + "some payload describing a python type", + StandardCharsets.UTF_8)) .setRepresentation( SchemaApi.FieldType.newBuilder() .setAtomicType(SchemaApi.AtomicType.BYTES)) diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java index 5298d29dad101..0c9e0065f5a64 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/BufferedElementCountingOutputStreamTest.java @@ -29,12 +29,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; import org.apache.beam.sdk.coders.ByteArrayCoder; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.hamcrest.collection.IsIterableContainingInOrder; import org.junit.Rule; @@ -165,7 +165,7 @@ public void testWritingByteWhenFinishedThrows() throws Exception { public void testWritingBytesWhenFinishedThrows() throws Exception { expectedException.expect(IOException.class); expectedException.expectMessage("Stream has been finished."); - testValues(toBytes("a")).write("b".getBytes(Charsets.UTF_8)); + 
testValues(toBytes("a")).write("b".getBytes(StandardCharsets.UTF_8)); } @Test @@ -203,7 +203,7 @@ public void testBehaviorWhenBufferPoolEmpty() throws Exception { private List toBytes(String... values) { ImmutableList.Builder builder = ImmutableList.builder(); for (String value : values) { - builder.add(value.getBytes(Charsets.UTF_8)); + builder.add(value.getBytes(StandardCharsets.UTF_8)); } return builder.build(); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java index e87f6a2b0d0a6..d26794274653a 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayInputStreamTest.java @@ -24,7 +24,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -33,7 +33,7 @@ @RunWith(JUnit4.class) public class ExposedByteArrayInputStreamTest { - private static final byte[] TEST_DATA = "Hello World!".getBytes(Charsets.UTF_8); + private static final byte[] TEST_DATA = "Hello World!".getBytes(StandardCharsets.UTF_8); private ByteArrayInputStream stream = new ByteArrayInputStream(TEST_DATA); @@ -74,6 +74,6 @@ public void testReadPartial() throws IOException { public void testReadAllAfterReadPartial() throws IOException { assertNotEquals(-1, exposedStream.read()); byte[] ret = exposedStream.readAll(); - assertArrayEquals("ello World!".getBytes(Charsets.UTF_8), ret); + assertArrayEquals("ello World!".getBytes(StandardCharsets.UTF_8), ret); } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java 
b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java index 7e1b213c85b25..a4a105a89ddc7 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/ExposedByteArrayOutputStreamTest.java @@ -25,7 +25,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; +import java.nio.charset.StandardCharsets; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -34,7 +34,7 @@ @RunWith(JUnit4.class) public class ExposedByteArrayOutputStreamTest { - private static final byte[] TEST_DATA = "Hello World!".getBytes(Charsets.UTF_8); + private static final byte[] TEST_DATA = "Hello World!".getBytes(StandardCharsets.UTF_8); private ExposedByteArrayOutputStream exposedStream = new ExposedByteArrayOutputStream(); private ByteArrayOutputStream stream = new ByteArrayOutputStream(); diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java index e15bd42dc3ce3..1f3ec0f427b4a 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/SerializableUtilsTest.java @@ -24,12 +24,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.List; import org.apache.beam.sdk.coders.AtomicCoder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.CoderException; import org.apache.beam.sdk.testing.InterceptingUrlClassLoader; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.junit.Rule; import org.junit.Test; @@ -93,7 +93,7 @@ public void testDeserializationError() { expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("unable to deserialize a bogus string"); SerializableUtils.deserializeFromByteArray( - "this isn't legal".getBytes(Charsets.UTF_8), "a bogus string"); + "this isn't legal".getBytes(StandardCharsets.UTF_8), "a bogus string"); } /** A class that is not serializable by Java. */ diff --git a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java index 3bd87c2ae5c75..1c8d515d5c85e 100644 --- a/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java +++ b/sdks/java/expansion-service/src/test/java/org/apache/beam/sdk/expansion/service/ExpansionServiceTest.java @@ -33,6 +33,7 @@ import com.google.auto.value.AutoValue; import java.io.IOException; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; @@ -60,7 +61,6 @@ import org.apache.beam.sdk.util.ByteStringOutputStream; import org.apache.beam.sdk.util.construction.PipelineTranslation; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -84,7 +84,7 @@ public class ExpansionServiceTest { private ExpansionService expansionService = new ExpansionService(); public static final List BYTE_LIST = ImmutableList.of("testing", 
"compound", "coders").stream() - .map(str -> str.getBytes(Charsets.UTF_8)) + .map(str -> str.getBytes(StandardCharsets.UTF_8)) .collect(Collectors.toList()); public static final Map BYTE_KV_LIST = ImmutableList.of("testing", "compound", "coders").stream() diff --git a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java index 67f386411d810..e56b95d7f8a6b 100644 --- a/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java +++ b/sdks/java/extensions/avro/src/main/java/org/apache/beam/sdk/extensions/avro/AvroGenericCoderTranslator.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.extensions.avro; +import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; import org.apache.avro.Schema; @@ -24,7 +25,6 @@ import org.apache.beam.sdk.extensions.avro.coders.AvroGenericCoder; import org.apache.beam.sdk.util.construction.CoderTranslation.TranslationContext; import org.apache.beam.sdk.util.construction.CoderTranslator; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; /** Coder translator for AvroGenericCoder. 
*/ public class AvroGenericCoderTranslator implements CoderTranslator { @@ -35,13 +35,13 @@ public List> getComponents(AvroGenericCoder from) { @Override public byte[] getPayload(AvroGenericCoder from) { - return from.getSchema().toString().getBytes(Charsets.UTF_8); + return from.getSchema().toString().getBytes(StandardCharsets.UTF_8); } @Override public AvroGenericCoder fromComponents( List> components, byte[] payload, TranslationContext context) { - Schema schema = new Schema.Parser().parse(new String(payload, Charsets.UTF_8)); + Schema schema = new Schema.Parser().parse(new String(payload, StandardCharsets.UTF_8)); return AvroGenericCoder.of(schema); } } diff --git a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java b/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java index 30a1a77872520..2a0bc36f6e9eb 100644 --- a/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java +++ b/sdks/java/extensions/avro/src/test/java/org/apache/beam/sdk/extensions/avro/io/AvroIOTest.java @@ -36,6 +36,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -100,7 +101,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TimestampedValue; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ArrayListMultimap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -1436,7 +1436,7 @@ public void testMetadata() throws Exception { "longKey", 100L, "bytesKey", - 
"bytesValue".getBytes(Charsets.UTF_8)))); + "bytesValue".getBytes(StandardCharsets.UTF_8)))); writePipeline.run(); try (DataFileStream dataFileStream = @@ -1444,7 +1444,7 @@ public void testMetadata() throws Exception { assertEquals("stringValue", dataFileStream.getMetaString("stringKey")); assertEquals(100L, dataFileStream.getMetaLong("longKey")); assertArrayEquals( - "bytesValue".getBytes(Charsets.UTF_8), dataFileStream.getMeta("bytesKey")); + "bytesValue".getBytes(StandardCharsets.UTF_8), dataFileStream.getMeta("bytesKey")); } } diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java index e4e5f35334456..c23a771f3cc8b 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -61,7 +62,6 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; @@ -499,7 +499,8 @@ public OutputT expand(InputT input) { requirementsFile.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new 
FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), + StandardCharsets.UTF_8)) { for (String pkg : extraPackages) { fout.write(pkg); fout.write('\n'); diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java index 4392f23c46360..ab4d02ec838d0 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java @@ -24,13 +24,13 @@ import java.io.InputStreamReader; import java.net.ServerSocket; import java.net.Socket; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.concurrent.TimeoutException; import org.apache.beam.sdk.util.ReleaseInfo; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.ByteStreams; import org.slf4j.Logger; @@ -106,7 +106,8 @@ public AutoCloseable start() throws IOException, InterruptedException { new ProcessBuilder(bootstrapCommand).redirectError(ProcessBuilder.Redirect.INHERIT).start(); bootstrap.getOutputStream().close(); BufferedReader reader = - new BufferedReader(new InputStreamReader(bootstrap.getInputStream(), Charsets.UTF_8)); + new BufferedReader( + new InputStreamReader(bootstrap.getInputStream(), StandardCharsets.UTF_8)); String lastLine = reader.readLine(); String lastNonEmptyLine = lastLine; while (lastLine != null) { diff --git a/sdks/java/extensions/sql/jdbc/build.gradle 
b/sdks/java/extensions/sql/jdbc/build.gradle index 41fddce7116ab..c5d462e0f5cad 100644 --- a/sdks/java/extensions/sql/jdbc/build.gradle +++ b/sdks/java/extensions/sql/jdbc/build.gradle @@ -35,11 +35,11 @@ dependencies { implementation "jline:jline:2.14.6" permitUnusedDeclared "jline:jline:2.14.6" // BEAM-11761 implementation "sqlline:sqlline:1.4.0" - implementation library.java.vendored_guava_32_1_2_jre implementation library.java.vendored_calcite_1_28_0 permitUnusedDeclared library.java.vendored_calcite_1_28_0 testImplementation project(path: ":sdks:java:io:google-cloud-platform", configuration: "testRuntimeMigration") testImplementation library.java.junit + testImplementation library.java.vendored_guava_32_1_2_jre // Depending on outputs so integrationTest can run with only test dependencies. // This enables us to test the JDBC jar being loaded on a custom classloader. integrationTest sourceSets.test.output diff --git a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java index ac049608ebcbd..8c87343cd7c11 100644 --- a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java +++ b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java @@ -23,10 +23,10 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.checkerframework.checker.nullness.qual.Nullable; import sqlline.SqlLine; import sqlline.SqlLine.Status; @@ -68,11 +68,11 @@ static Status runSqlLine( SqlLine sqlLine = new SqlLine(); if (outputStream != null) { - sqlLine.setOutputStream(new PrintStream(outputStream, false, 
Charsets.UTF_8.name())); + sqlLine.setOutputStream(new PrintStream(outputStream, false, StandardCharsets.UTF_8.name())); } if (errorStream != null) { - sqlLine.setErrorStream(new PrintStream(errorStream, false, Charsets.UTF_8.name())); + sqlLine.setErrorStream(new PrintStream(errorStream, false, StandardCharsets.UTF_8.name())); } return sqlLine.begin(modifiedArgs, inputStream, true); diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java index e5a46f877001a..e34106db1d936 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/text/TextTableProviderTest.java @@ -21,6 +21,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder; import java.io.File; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import org.apache.beam.sdk.extensions.sql.SqlTransform; import org.apache.beam.sdk.schemas.Schema; @@ -33,7 +34,6 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -81,7 +81,7 @@ protected void after() {} public void testLegacyDefaultCsv() throws Exception { Files.write( tempFolder.newFile("test.csv").toPath(), - "hello,13\n\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + "hello,13\n\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = String.format( @@ -105,7 +105,7 @@ public void testLegacyDefaultCsv() throws Exception { public void testLegacyTdfCsv() throws Exception { Files.write( 
tempFolder.newFile("test.csv").toPath(), - "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8)); + "hello\t13\n\ngoodbye\t42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -138,7 +138,7 @@ public void testLegacyTdfCsv() throws Exception { public void testExplicitCsv() throws Exception { Files.write( tempFolder.newFile("test.csv").toPath(), - "hello,13\n\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + "hello,13\n\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -165,7 +165,8 @@ public void testExplicitCsv() throws Exception { @Test public void testExplicitCsvExcel() throws Exception { Files.write( - tempFolder.newFile("test.csv").toPath(), "hello\n\ngoodbye\n".getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.csv").toPath(), + "hello\n\ngoodbye\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -191,7 +192,8 @@ public void testExplicitCsvExcel() throws Exception { public void testLines() throws Exception { // Data that looks like CSV but isn't parsed as it Files.write( - tempFolder.newFile("test.csv").toPath(), "hello,13\ngoodbye,42\n".getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.csv").toPath(), + "hello,13\ngoodbye,42\n".getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -210,7 +212,8 @@ public void testLines() throws Exception { @Test public void testJson() throws Exception { - Files.write(tempFolder.newFile("test.json").toPath(), JSON_TEXT.getBytes(Charsets.UTF_8)); + Files.write( + tempFolder.newFile("test.json").toPath(), JSON_TEXT.getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = @@ -229,7 +232,8 @@ public void testJson() throws Exception { public void testInvalidJson() throws Exception { File deadLetterFile = new File(tempFolder.getRoot(), "dead-letter-file"); Files.write( - tempFolder.newFile("test.json").toPath(), 
INVALID_JSON_TEXT.getBytes(Charsets.UTF_8)); + tempFolder.newFile("test.json").toPath(), + INVALID_JSON_TEXT.getBytes(StandardCharsets.UTF_8)); String query = "SELECT * FROM test"; String ddl = diff --git a/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java b/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java index c8c49a656e3be..09a6ced44d379 100644 --- a/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java +++ b/sdks/java/io/clickhouse/src/main/java/org/apache/beam/sdk/io/clickhouse/ClickHouseWriter.java @@ -21,12 +21,12 @@ import com.clickhouse.client.ClickHousePipedOutputStream; import com.clickhouse.client.data.BinaryStreamUtils; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.List; import org.apache.beam.sdk.io.clickhouse.TableSchema.ColumnType; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.RowWithStorage; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.joda.time.Days; import org.joda.time.Instant; @@ -60,7 +60,7 @@ static void writeValue(ClickHouseOutputStream stream, ColumnType columnType, Obj byte[] bytes; if (value instanceof String) { - bytes = ((String) value).getBytes(Charsets.UTF_8); + bytes = ((String) value).getBytes(StandardCharsets.UTF_8); } else { bytes = ((byte[]) value); } diff --git a/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java b/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java index 48904cae430f1..2cc89a2a0dc19 100644 --- a/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java +++ 
b/sdks/java/io/contextualtextio/src/test/java/org/apache/beam/sdk/io/contextualtextio/ContextualTextIOTest.java @@ -44,6 +44,7 @@ import java.io.OutputStream; import java.io.PrintStream; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -87,7 +88,6 @@ import org.apache.beam.sdk.util.CoderUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Joiner; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -524,7 +524,7 @@ public String createFileFromList(List input) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : input) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -548,7 +548,7 @@ public void multipleFilesTest() throws Exception { for (int lineNum = 0; lineNum < numLines; ++lineNum) { String elem = filename + " " + lineNum; byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -588,7 +588,7 @@ public void testWithHintMatchesManyFiles() throws IOException { for (int lineNum = 0; lineNum < 10 + num; ++lineNum) { String elem = filename + " " + lineNum; byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, 
StandardCharsets.UTF_8); writer.println(line); } } @@ -817,7 +817,7 @@ private void runTestRead(String[] expected) throws Exception { try (PrintStream writer = new PrintStream(new FileOutputStream(tmpFile))) { for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); } } @@ -840,7 +840,7 @@ private void runTestReadLineNumsAndFileName(String[] expected) throws Exception int lineNum = 0; for (String elem : expected) { byte[] encodedElem = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), elem); - String line = new String(encodedElem, Charsets.UTF_8); + String line = new String(encodedElem, StandardCharsets.UTF_8); writer.println(line); actualExpected.add(lineNum + " " + filePath + " " + line); lineNum++; @@ -1121,7 +1121,7 @@ public void testProgressEmptyFile() throws IOException { public void testProgressTextFile() throws IOException { String file = "line1\nline2\nline3"; try (BoundedSource.BoundedReader reader = - prepareSource(file.getBytes(Charsets.UTF_8)) + prepareSource(file.getBytes(StandardCharsets.UTF_8)) .createReader(PipelineOptionsFactory.create())) { // Check preconditions before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); @@ -1157,7 +1157,7 @@ public void testProgressTextFile() throws IOException { @Test public void testProgressAfterSplitting() throws IOException { String file = "line1\nline2\nline3"; - BoundedSource source = prepareSource(file.getBytes(Charsets.UTF_8)); + BoundedSource source = prepareSource(file.getBytes(StandardCharsets.UTF_8)); BoundedSource remainder; // Create the remainder, verifying properties pre- and post-splitting. 
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java index dd5a9abd5ac8e..98aade888a33d 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubReadSchemaTransformProviderTest.java @@ -25,6 +25,7 @@ import com.google.protobuf.Timestamp; import java.io.IOException; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -47,7 +48,6 @@ import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.Rule; @@ -170,7 +170,7 @@ public void testReadRaw() throws IOException { PCollectionRowTuple begin = PCollectionRowTuple.empty(p); Schema rawSchema = Schema.of(Schema.Field.of("payload", Schema.FieldType.BYTES)); - byte[] payload = "some payload".getBytes(Charsets.UTF_8); + byte[] payload = "some payload".getBytes(StandardCharsets.UTF_8); try (PubsubTestClientFactory clientFactory = clientFactory(ImmutableList.of(incomingMessageOf(payload, CLOCK.currentTimeMillis())))) { @@ -211,7 +211,7 @@ public void testReadAttributes() throws IOException { .addStringField("attr") .addMapField("attrMap", Schema.FieldType.STRING, Schema.FieldType.STRING) .build(); - byte[] payload = "some payload".getBytes(Charsets.UTF_8); + byte[] payload = "some 
payload".getBytes(StandardCharsets.UTF_8); String attr = "attr value"; try (PubsubTestClientFactory clientFactory = diff --git a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java index 612b20393d789..6ee3d9d96ef68 100644 --- a/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java +++ b/sdks/java/io/kafka/src/test/java/org/apache/beam/sdk/io/kafka/ReadFromKafkaDoFnTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; import java.time.Duration; import java.util.ArrayList; import java.util.Collection; @@ -57,7 +58,6 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -257,8 +257,8 @@ public synchronized ConsumerRecords poll(Duration timeout) { topicPartition.topic(), topicPartition.partition(), startOffset + i, - key.getBytes(Charsets.UTF_8), - value.getBytes(Charsets.UTF_8))); + key.getBytes(StandardCharsets.UTF_8), + value.getBytes(StandardCharsets.UTF_8))); } if (records.isEmpty()) { return ConsumerRecords.empty(); diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java index 4983d52a642f5..8071bad84d73b 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java +++ 
b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/QueryReader.java @@ -17,11 +17,11 @@ */ package org.apache.beam.sdk.tpcds; +import java.nio.charset.StandardCharsets; import java.util.Set; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.SqlNode; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.parser.SqlParseException; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.parser.SqlParser; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; /** @@ -39,7 +39,7 @@ public class QueryReader { */ public static String readQuery(String queryFileName) throws Exception { String path = "queries/" + queryFileName + ".sql"; - return Resources.toString(Resources.getResource(path), Charsets.UTF_8); + return Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8); } /** diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java index 1550a25b7c8f1..6efb7e7e06598 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java @@ -21,6 +21,7 @@ import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -56,7 +57,6 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.SqlIdentifier; import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.util.SqlBasicVisitor; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import 
org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; import org.apache.commons.csv.CSVFormat; @@ -212,7 +212,7 @@ private static PCollection getTableCSV( private static org.apache.avro.Schema getAvroSchema(String tableName) throws IOException { String path = "schemas_avro/" + tableName + ".json"; return new org.apache.avro.Schema.Parser() - .parse(Resources.toString(Resources.getResource(path), Charsets.UTF_8)); + .parse(Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8)); } static org.apache.avro.Schema getProjectedSchema( diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java index 485fa83a4a8ee..97116e14cdcd5 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/TableSchemaJSONLoader.java @@ -20,11 +20,11 @@ import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.io.Resources; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.reflect.ClassPath; import org.json.simple.JSONArray; @@ -49,7 +49,7 @@ public class TableSchemaJSONLoader { @SuppressWarnings({"rawtypes", "DefaultCharset"}) public static String parseTableSchema(String tableName) throws Exception { String path = "schemas/" + tableName + ".json"; - String schema = Resources.toString(Resources.getResource(path), Charsets.UTF_8); + String schema = 
Resources.toString(Resources.getResource(path), StandardCharsets.UTF_8); JSONObject jsonObject = (JSONObject) new JSONParser().parse(schema); JSONArray jsonArray = (JSONArray) jsonObject.get("schema"); diff --git a/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java index a9ae5360a8598..b766d2b13a4bc 100644 --- a/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java +++ b/sdks/java/transform-service/launcher/src/test/java/org/apache/beam/sdk/transformservice/launcher/TransformServiceLauncherTest.java @@ -25,12 +25,12 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.UUID; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Charsets; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -77,7 +77,7 @@ public void testLauncherInstallsDependencies() throws IOException { try (Writer fout = new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("pypipackage1\n"); fout.write("pypipackage2\n"); } @@ -118,7 +118,7 @@ public void testLauncherInstallsLocalDependencies() throws IOException { dependency1.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(dependency1.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(dependency1.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("tempdata\n"); } @@ -128,7 +128,7 @@ public void 
testLauncherInstallsLocalDependencies() throws IOException { dependency2.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(dependency2.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(dependency2.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write("tempdata\n"); } @@ -140,7 +140,7 @@ public void testLauncherInstallsLocalDependencies() throws IOException { requirementsFile.deleteOnExit(); try (Writer fout = new OutputStreamWriter( - new FileOutputStream(requirementsFile.getAbsolutePath()), Charsets.UTF_8)) { + new FileOutputStream(requirementsFile.getAbsolutePath()), StandardCharsets.UTF_8)) { fout.write(dependency1.getAbsolutePath() + "\n"); fout.write(dependency2.getAbsolutePath() + "\n"); fout.write("pypipackage" + "\n"); From 99a23830037f58178d3fdf9db22f27b4de37dac4 Mon Sep 17 00:00:00 2001 From: Damon Date: Tue, 6 Aug 2024 09:48:47 -0700 Subject: [PATCH 16/78] Enable artifact staging during Prism Runner lifecycle (#32084) --- runners/prism/java/build.gradle | 1 + .../runners/prism/PrismArtifactStager.java | 173 ++++++++++++++++++ .../prism/PrismArtifactStagerTest.java | 143 +++++++++++++++ 3 files changed, 317 insertions(+) create mode 100644 runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java create mode 100644 runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java diff --git a/runners/prism/java/build.gradle b/runners/prism/java/build.gradle index 2b0635ca61255..96ab4e70a5792 100644 --- a/runners/prism/java/build.gradle +++ b/runners/prism/java/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation project(path: ":model:pipeline", configuration: "shadow") implementation project(path: ":sdks:java:core", configuration: "shadow") implementation project(path: ":sdks:java:harness", configuration: "shadow") + implementation project(":runners:java-fn-execution") implementation project(":runners:portability:java") implementation 
library.java.joda_time diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java new file mode 100644 index 0000000000000..f1d99a213eea3 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismArtifactStager.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.util.Optional; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import org.apache.beam.model.jobmanagement.v1.ArtifactStagingServiceGrpc; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; +import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; +import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Stages {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts of prepared jobs. + */ +@AutoValue +abstract class PrismArtifactStager implements AutoCloseable { + + private static final Logger LOG = LoggerFactory.getLogger(PrismArtifactStager.class); + + /** + * Instantiate a {@link PrismArtifactStager} via call to {@link #of(String, String)}, assigning + * {@link Builder#setStagingEndpoint} using {@param prepareJobResponse} {@link + * JobApi.PrepareJobResponse#getArtifactStagingEndpoint} and {@link + * JobApi.PrepareJobResponse#getStagingSessionToken}. + */ + static PrismArtifactStager of(JobApi.PrepareJobResponse prepareJobResponse) { + return of( + prepareJobResponse.getArtifactStagingEndpoint().getUrl(), + prepareJobResponse.getStagingSessionToken()); + } + + /** + * Instantiates a {@link PrismArtifactStager} from the {@param stagingEndpoint} URL and {@param + * stagingSessionToken} to instantiate the {@link #getRetrievalService}, {@link + * #getManagedChannel}, and {@link #getStagingServiceStub} defaults. 
See the referenced getters + * for more details. + */ + static PrismArtifactStager of(String stagingEndpoint, String stagingSessionToken) { + return PrismArtifactStager.builder() + .setStagingEndpoint(stagingEndpoint) + .setStagingSessionToken(stagingSessionToken) + .build(); + } + + static Builder builder() { + return new AutoValue_PrismArtifactStager.Builder(); + } + + /** + * Stage the {@link org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline} artifacts via {@link + * ArtifactStagingService#offer} supplying {@link #getRetrievalService}, {@link + * #getStagingServiceStub}, and {@link #getStagingSessionToken}. + */ + void stage() throws ExecutionException, InterruptedException { + LOG.info("staging artifacts at {}", getStagingEndpoint()); + ArtifactStagingService.offer( + getRetrievalService(), getStagingServiceStub(), getStagingSessionToken()); + } + + /** The URL of the {@link ArtifactStagingService}. */ + abstract String getStagingEndpoint(); + + /** + * Token associated with a staging session and acquired from a {@link + * JobServiceGrpc.JobServiceStub#prepare}'s {@link JobApi.PrepareJobResponse}. + */ + abstract String getStagingSessionToken(); + + /** + * The service that retrieves artifacts; defaults to instantiating from the default {@link + * ArtifactRetrievalService#ArtifactRetrievalService()} constructor. + */ + abstract ArtifactRetrievalService getRetrievalService(); + + /** + * Used to instantiate the {@link #getStagingServiceStub}. By default, instantiates using {@link + * ManagedChannelFactory#forDescriptor(Endpoints.ApiServiceDescriptor)}, where {@link + * Endpoints.ApiServiceDescriptor} is instantiated via {@link + * Endpoints.ApiServiceDescriptor.Builder#setUrl(String)} and the URL provided by {@link + * #getStagingEndpoint}. + */ + abstract ManagedChannel getManagedChannel(); + + /** + * Required by {@link ArtifactStagingService#offer}. 
By default, instantiates using {@link + * ArtifactStagingServiceGrpc#newStub} and {@link #getManagedChannel}. + */ + abstract ArtifactStagingServiceGrpc.ArtifactStagingServiceStub getStagingServiceStub(); + + @Override + public void close() { + LOG.info("shutting down {}", PrismArtifactStager.class); + getRetrievalService().close(); + getManagedChannel().shutdown(); + try { + getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + } + } + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setStagingEndpoint(String stagingEndpoint); + + abstract Optional getStagingEndpoint(); + + abstract Builder setStagingSessionToken(String stagingSessionToken); + + abstract Builder setRetrievalService(ArtifactRetrievalService retrievalService); + + abstract Optional getRetrievalService(); + + abstract Builder setManagedChannel(ManagedChannel managedChannel); + + abstract Optional getManagedChannel(); + + abstract Builder setStagingServiceStub( + ArtifactStagingServiceGrpc.ArtifactStagingServiceStub stub); + + abstract Optional + getStagingServiceStub(); + + abstract PrismArtifactStager autoBuild(); + + final PrismArtifactStager build() { + + checkState(getStagingEndpoint().isPresent(), "missing staging endpoint"); + ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); + + if (!getManagedChannel().isPresent()) { + Endpoints.ApiServiceDescriptor descriptor = + Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getStagingEndpoint().get()).build(); + setManagedChannel(channelFactory.forDescriptor(descriptor)); + } + + if (!getStagingServiceStub().isPresent()) { + setStagingServiceStub(ArtifactStagingServiceGrpc.newStub(getManagedChannel().get())); + } + + if (!getRetrievalService().isPresent()) { + setRetrievalService(new ArtifactRetrievalService()); + } + + return autoBuild(); + } + } +} diff --git 
a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java new file mode 100644 index 0000000000000..d3ac8a72eafb9 --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismArtifactStagerTest.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService.EMBEDDED_ARTIFACT_URN; +import static org.junit.Assert.assertThrows; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.runners.fnexecution.artifact.ArtifactRetrievalService; +import org.apache.beam.runners.fnexecution.artifact.ArtifactStagingService; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.commons.io.output.ByteArrayOutputStream; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismArtifactStager}. 
*/ +@RunWith(JUnit4.class) +public class PrismArtifactStagerTest { + + @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + final ArtifactStagingService stagingService = + new ArtifactStagingService(new TestDestinationProvider()); + + @Test + public void givenValidArtifacts_stages() + throws IOException, ExecutionException, InterruptedException { + PrismArtifactStager underTest = prismArtifactStager(validArtifacts()); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + underTest.stage(); + assertThat(stagingService.getStagedArtifacts(underTest.getStagingSessionToken())).isNotEmpty(); + underTest.close(); + assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); + } + + @Test + public void givenErrors_performsGracefulCleanup() throws IOException { + PrismArtifactStager underTest = prismArtifactStager(invalidArtifacts()); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + ExecutionException error = assertThrows(ExecutionException.class, underTest::stage); + assertThat(error.getMessage()).contains("Unexpected artifact type: invalid-type-urn"); + assertThat(underTest.getManagedChannel().isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.getManagedChannel().isShutdown()).isTrue(); + } + + private PrismArtifactStager prismArtifactStager( + Map> artifacts) throws IOException { + String serverName = InProcessServerBuilder.generateName(); + ArtifactRetrievalService retrievalService = new ArtifactRetrievalService(); + String stagingToken = "staging-token"; + stagingService.registerJob(stagingToken, artifacts); + + grpcCleanup.register( + InProcessServerBuilder.forName(serverName) + .directExecutor() + .addService(stagingService) + .addService(retrievalService) + .build() + .start()); + + ManagedChannel channel = + grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); + + return PrismArtifactStager.builder() + .setStagingEndpoint("ignore") + 
.setStagingSessionToken(stagingToken) + .setManagedChannel(channel) + .build(); + } + + private Map> validArtifacts() { + return ImmutableMap.of( + "env1", + Collections.singletonList( + RunnerApi.ArtifactInformation.newBuilder() + .setTypeUrn(EMBEDDED_ARTIFACT_URN) + .setTypePayload( + RunnerApi.EmbeddedFilePayload.newBuilder() + .setData(ByteString.copyFromUtf8("type-payload")) + .build() + .toByteString()) + .setRoleUrn("role-urn") + .build())); + } + + private Map> invalidArtifacts() { + return ImmutableMap.of( + "env1", + Collections.singletonList( + RunnerApi.ArtifactInformation.newBuilder() + .setTypeUrn("invalid-type-urn") + .setTypePayload( + RunnerApi.EmbeddedFilePayload.newBuilder() + .setData(ByteString.copyFromUtf8("type-payload")) + .build() + .toByteString()) + .setRoleUrn("role-urn") + .build())); + } + + private static class TestDestinationProvider + implements ArtifactStagingService.ArtifactDestinationProvider { + + @Override + public ArtifactStagingService.ArtifactDestination getDestination( + String stagingToken, String name) throws IOException { + return ArtifactStagingService.ArtifactDestination.create( + EMBEDDED_ARTIFACT_URN, ByteString.EMPTY, new ByteArrayOutputStream()); + } + + @Override + public void removeStagedArtifacts(String stagingToken) throws IOException {} + } +} From 741facf00993f24cca9418078ac62ff53c28e04e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:18:56 -0700 Subject: [PATCH 17/78] Bump github.com/docker/docker in /sdks (#32046) Bumps [github.com/docker/docker](https://github.com/docker/docker) from 25.0.5+incompatible to 25.0.6+incompatible. - [Release notes](https://github.com/docker/docker/releases) - [Commits](https://github.com/docker/docker/compare/v25.0.5...v25.0.6) --- updated-dependencies: - dependency-name: github.com/docker/docker dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 6d42e02296c78..1716a6e2d22dc 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -137,7 +137,7 @@ require ( github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 // indirect github.com/containerd/containerd v1.7.11 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect - github.com/docker/docker v25.0.5+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v25.0.6+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect github.com/envoyproxy/go-control-plane v0.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 098f858488b72..a0b4738decc3d 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -768,8 +768,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK2OFGvA0= github.com/distribution/reference v0.5.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v25.0.5+incompatible h1:UmQydMduGkrD5nQde1mecF/YnSbTOaPeFIeP5C4W+DE= -github.com/docker/docker v25.0.5+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v25.0.6+incompatible h1:5cPwbwriIcsua2REJe8HqQV+6WlWc1byg2QSXzBxBGg= +github.com/docker/docker v25.0.6+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units 
v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= From 5ab908b984d4144b5cbe584d7ed4ed7a4e226993 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 6 Aug 2024 15:03:57 -0400 Subject: [PATCH 18/78] Add Lineage metrics for BigtableIO (#32068) * Add Lineage metrics for BigtableIO * add tests * simplify metrics query logics; exclude test actually already failing * Address comments, fix typo --- .../org/apache/beam/sdk/metrics/Lineage.java | 43 ++++++++++++++++--- .../io/google-cloud-platform/build.gradle | 4 ++ .../beam/sdk/io/gcp/bigtable/BigtableIO.java | 14 ++++++ .../sdk/io/gcp/bigtable/BigtableService.java | 6 +++ .../io/gcp/bigtable/BigtableServiceImpl.java | 22 ++++++++++ .../io/gcp/bigquery/BigQueryIOReadTest.java | 23 ++-------- .../io/gcp/bigquery/BigQueryIOWriteTest.java | 13 +----- .../sdk/io/gcp/bigtable/BigtableReadIT.java | 21 ++++++++- .../sdk/io/gcp/bigtable/BigtableWriteIT.java | 18 +++++++- 9 files changed, 123 insertions(+), 41 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java index 7890a9f74b941..8b69b0ef55236 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java @@ -17,17 +17,17 @@ */ package org.apache.beam.sdk.metrics; +import java.util.HashSet; +import java.util.Set; + /** * Standard collection of metrics used to record source and sinks information for lineage tracking. 
*/ public class Lineage { - public static final String LINEAGE_NAMESPACE = "lineage"; - public static final String SOURCE_METRIC_NAME = "sources"; - public static final String SINK_METRIC_NAME = "sinks"; - - private static final StringSet SOURCES = Metrics.stringSet(LINEAGE_NAMESPACE, SOURCE_METRIC_NAME); - private static final StringSet SINKS = Metrics.stringSet(LINEAGE_NAMESPACE, SINK_METRIC_NAME); + private static final StringSet SOURCES = + Metrics.stringSet(LINEAGE_NAMESPACE, Type.SOURCE.toString()); + private static final StringSet SINKS = Metrics.stringSet(LINEAGE_NAMESPACE, Type.SINK.toString()); /** {@link StringSet} representing sources and optionally side inputs. */ public static StringSet getSources() { @@ -38,4 +38,35 @@ public static StringSet getSources() { public static StringSet getSinks() { return SINKS; } + + /** Query {@link StringSet} metrics from {@link MetricResults}. */ + public static Set query(MetricResults results, Type type) { + MetricsFilter filter = + MetricsFilter.builder() + .addNameFilter(MetricNameFilter.named(LINEAGE_NAMESPACE, type.toString())) + .build(); + Set result = new HashSet<>(); + for (MetricResult metrics : results.queryMetrics(filter).getStringSets()) { + result.addAll(metrics.getCommitted().getStringSet()); + result.addAll(metrics.getAttempted().getStringSet()); + } + return result; + } + + /** Lineage metrics resource types. 
*/ + public enum Type { + SOURCE("source"), + SINK("sink"); + + private final String name; + + Type(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } + } } diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index e499bae6fc64f..23c56f13a94c7 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -218,6 +218,10 @@ task integrationTest(type: Test, dependsOn: processTestResources) { useJUnit { excludeCategories "org.apache.beam.sdk.testing.UsesKms" + filter { + // https://github.com/apache/beam/issues/32071 + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableReadIT.testE2EBigtableSegmentRead' + } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java index d78ae2cb6c578..6d20109e947ba 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableIO.java @@ -1337,6 +1337,7 @@ private static class BigtableWriterFn private transient Set> badRecords = null; // Due to callback thread not supporting Beam metrics, Record pending metrics and report later. private transient long pendingThrottlingMsecs; + private transient boolean reportedLineage; // Assign serviceEntry in startBundle and clear it in tearDown. 
@Nullable private BigtableServiceEntry serviceEntry; @@ -1480,6 +1481,10 @@ public void finishBundle(FinishBundleContext c) throws Exception { throttlingMsecs.inc(excessTime); } } + if (!reportedLineage) { + bigtableWriter.reportLineage(); + reportedLineage = true; + } bigtableWriter = null; } @@ -1612,6 +1617,7 @@ public String toString() { private final BigtableConfig config; private final BigtableReadOptions readOptions; private @Nullable Long estimatedSizeBytes; + private transient boolean reportedLineage; private final BigtableServiceFactory.ConfigId configId; @@ -1989,6 +1995,13 @@ public List getRanges() { public ValueProvider getTableId() { return readOptions.getTableId(); } + + void reportLineageOnce(BigtableService.Reader reader) { + if (!reportedLineage) { + reader.reportLineage(); + reportedLineage = true; + } + } } private static class BigtableReader extends BoundedReader { @@ -2019,6 +2032,7 @@ true, makeByteKey(reader.getCurrentRow().getKey()))) || rangeTracker.markDone(); if (hasRecord) { ++recordsReturned; + source.reportLineageOnce(reader); } return hasRecord; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java index 261cc3ac081d8..50d8126999c4b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableService.java @@ -57,6 +57,9 @@ CompletionStage writeRecord(KV * @throws IOException if there is an error closing the writer */ void close() throws IOException; + + /** Report Lineage metrics to runner. */ + default void reportLineage() {} } /** The interface of a class that reads from Cloud Bigtable. 
*/ @@ -77,6 +80,9 @@ interface Reader { Row getCurrentRow() throws NoSuchElementException; void close(); + + /** Report Lineage metrics to runner. */ + default void reportLineage() {} } /** Returns a {@link Reader} that will read from the specified source. */ diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java index f06a4a1276864..6fdf67722bac2 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableServiceImpl.java @@ -71,6 +71,7 @@ import org.apache.beam.sdk.io.gcp.bigtable.BigtableIO.BigtableSource; import org.apache.beam.sdk.io.range.ByteKeyRange; import org.apache.beam.sdk.metrics.Distribution; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.values.KV; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -212,6 +213,11 @@ public void close() { exhausted = true; } } + + @Override + public void reportLineage() { + Lineage.getSources().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } } @VisibleForTesting @@ -225,6 +231,9 @@ static class BigtableSegmentReaderImpl implements Reader { private final int refillSegmentWaterMark; private final long maxSegmentByteSize; private ServiceCallMetric serviceCallMetric; + private final String projectId; + private final String instanceId; + private final String tableId; private static class UpstreamResults { private final List rows; @@ -308,11 +317,19 @@ static BigtableSegmentReaderImpl create( // Asynchronously refill buffer when there is 10% of the elements are left this.refillSegmentWaterMark = Math.max(1, (int) (request.getRowsLimit() * 
WATERMARK_PERCENTAGE)); + this.projectId = projectId; + this.instanceId = instanceId; + this.tableId = tableId; } @Override public void close() {} + @Override + public void reportLineage() { + Lineage.getSources().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } + @Override public boolean start() throws IOException { future = fetchNextSegment(); @@ -578,6 +595,11 @@ public void writeSingleRecord(KV> record) throws } } + @Override + public void reportLineage() { + Lineage.getSinks().add(String.format("bigtable:%s.%s.%s", projectId, instanceId, tableId)); + } + private ServiceCallMetric createServiceCallMetric() { // Populate metrics HashMap baseLabels = new HashMap<>(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java index 5c43666e79e5c..a8aca7570b33d 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOReadTest.java @@ -43,6 +43,7 @@ import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.concurrent.ExecutionException; import org.apache.avro.specific.SpecificDatumReader; import org.apache.avro.specific.SpecificRecordBase; @@ -61,9 +62,6 @@ import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; import org.apache.beam.sdk.metrics.Lineage; -import org.apache.beam.sdk.metrics.MetricNameFilter; -import org.apache.beam.sdk.metrics.MetricQueryResults; -import org.apache.beam.sdk.metrics.MetricsFilter; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; @@ -351,18 
+349,8 @@ private void checkTypedReadQueryObjectWithValidate( } private void checkLineageSourceMetric(PipelineResult pipelineResult, String tableName) { - MetricQueryResults lineageMetrics = - pipelineResult - .metrics() - .queryMetrics( - MetricsFilter.builder() - .addNameFilter( - MetricNameFilter.named( - Lineage.LINEAGE_NAMESPACE, Lineage.SOURCE_METRIC_NAME)) - .build()); - assertThat( - lineageMetrics.getStringSets().iterator().next().getCommitted().getStringSet(), - contains("bigquery:" + tableName.replace(':', '.'))); + Set result = Lineage.query(pipelineResult.metrics(), Lineage.Type.SOURCE); + assertThat(result, contains("bigquery:" + tableName.replace(':', '.'))); } @Before @@ -600,10 +588,7 @@ public void processElement(ProcessContext c) throws Exception { new MyData("b", 2L, bd1, bd2), new MyData("c", 3L, bd1, bd2))); PipelineResult result = p.run(); - // Skip when direct runner splits outside of a counters context. - if (useTemplateCompatibility) { - checkLineageSourceMetric(result, "non-executing-project:somedataset.sometable"); - } + checkLineageSourceMetric(result, "non-executing-project:somedataset.sometable"); } @Test diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java index bc90d4c8bae79..c5af8045bfe20 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java @@ -118,9 +118,6 @@ import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; import org.apache.beam.sdk.metrics.Lineage; -import org.apache.beam.sdk.metrics.MetricNameFilter; -import org.apache.beam.sdk.metrics.MetricQueryResults; -import 
org.apache.beam.sdk.metrics.MetricsFilter; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.schemas.JavaFieldSchema; @@ -285,16 +282,8 @@ public void evaluate() throws Throwable { .withJobService(fakeJobService); private void checkLineageSinkMetric(PipelineResult pipelineResult, String tableName) { - MetricQueryResults lineageMetrics = - pipelineResult - .metrics() - .queryMetrics( - MetricsFilter.builder() - .addNameFilter( - MetricNameFilter.named(Lineage.LINEAGE_NAMESPACE, Lineage.SINK_METRIC_NAME)) - .build()); assertThat( - lineageMetrics.getStringSets().iterator().next().getCommitted().getStringSet(), + Lineage.query(pipelineResult.metrics(), Lineage.Type.SINK), hasItem("bigquery:" + tableName.replace(':', '.'))); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java index bc88858ebc33e..4ce9ad10b2c06 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableReadIT.java @@ -17,6 +17,9 @@ */ package org.apache.beam.sdk.io.gcp.bigtable; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; + import com.google.cloud.bigtable.admin.v2.BigtableTableAdminClient; import com.google.cloud.bigtable.admin.v2.BigtableTableAdminSettings; import com.google.cloud.bigtable.admin.v2.models.CreateTableRequest; @@ -28,7 +31,9 @@ import java.util.Date; import org.apache.beam.repackaged.core.org.apache.commons.lang3.StringUtils; import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.metrics.Lineage; import 
org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; @@ -110,7 +115,8 @@ public void testE2EBigtableRead() { p.apply(BigtableIO.read().withBigtableOptions(bigtableOptionsBuilder).withTableId(tableId)) .apply(Count.globally()); PAssert.thatSingleton(count).isEqualTo(numRows); - p.run(); + PipelineResult r = p.run(); + checkLineageSourceMetric(r, tableId); } @Test @@ -138,6 +144,17 @@ public void testE2EBigtableSegmentRead() { .withMaxBufferElementCount(10)) .apply(Count.globally()); PAssert.thatSingleton(count).isEqualTo(numRows); - p.run(); + PipelineResult r = p.run(); + checkLineageSourceMetric(r, tableId); + } + + private void checkLineageSourceMetric(PipelineResult r, String tableId) { + // TODO(https://github.com/apache/beam/issues/32071) test malformed, + // when pipeline.run() is non-blocking, the metrics are not available by the time of query + if (options.getRunner().getName().contains("DirectRunner")) { + assertThat( + Lineage.query(r.metrics(), Lineage.Type.SOURCE), + hasItem(String.format("bigtable:%s.%s.%s", project, options.getInstanceId(), tableId))); + } } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java index bf9f7d991fa24..46bb3df836e56 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigtable/BigtableWriteIT.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigtable; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.hasItem; import static org.junit.Assert.assertEquals; import com.google.api.gax.rpc.ServerStream; @@ -39,8 +40,10 @@ import java.util.Objects; import 
java.util.stream.Collectors; import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.testing.PAssert; @@ -142,7 +145,7 @@ public void processElement(ProcessContext c) { .withProjectId(project) .withInstanceId(options.getInstanceId()) .withTableId(tableId)); - p.run(); + PipelineResult r = p.run(); // Test number of column families and column family name equality Table table = getTable(tableId); @@ -154,6 +157,7 @@ public void processElement(ProcessContext c) { // Test table data equality List> tableData = getTableData(tableId); assertThat(tableData, Matchers.containsInAnyOrder(testData.toArray())); + checkLineageSinkMetric(r, tableId); } @Test @@ -340,7 +344,7 @@ public void failureTest(int numRows, DoFn> tableData = getTableData(tableId); assertEquals(998, tableData.size()); + checkLineageSinkMetric(r, tableId); } @After @@ -412,4 +417,13 @@ private void deleteTable(String tableId) { tableAdminClient.deleteTable(tableId); } } + + private void checkLineageSinkMetric(PipelineResult r, String tableId) { + // Only check lineage metrics on direct runner until Dataflow runner v2 supported report back + if (options.getRunner().getName().contains("DirectRunner")) { + assertThat( + Lineage.query(r.metrics(), Lineage.Type.SINK), + hasItem(String.format("bigtable:%s.%s.%s", project, options.getInstanceId(), tableId))); + } + } } From 17283bb8294f22edfc4d00c49bf3d9a518a1551b Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 6 Aug 2024 15:35:16 -0400 Subject: [PATCH 19/78] Add Lineage metrics to PubsubIO (#32037) * Add Lineage metrics to PubsubIO * fix format and add test * make getDatacatalogname fail safe --- .../beam/sdk/io/gcp/pubsub/PubsubClient.java | 28 
+++++++++++++++++++ .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 9 ++++++ .../io/gcp/pubsub/PubsubUnboundedSink.java | 13 +++++++++ .../io/gcp/pubsub/PubsubUnboundedSource.java | 14 ++++++++++ .../sdk/io/gcp/pubsub/PubsubClientTest.java | 2 ++ .../beam/sdk/io/gcp/pubsub/PubsubIOTest.java | 3 ++ 6 files changed, 69 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java index 79a9bb7f07d64..f66ee6e1d8425 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClient.java @@ -39,12 +39,15 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.checkerframework.checker.nullness.qual.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** An (abstract) helper class for talking to Pubsub via an underlying transport. */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) public abstract class PubsubClient implements Closeable { + private static final Logger LOG = LoggerFactory.getLogger(PubsubClient.class); private static final Map> schemaTypeToConversionFnMap = ImmutableMap.of( @@ -257,6 +260,10 @@ public String getFullPath() { return String.format("/subscriptions/%s/%s", projectId, subscriptionName); } + public String getDataCatalogName() { + return String.format("pubsub:subscription:%s.%s", projectId, subscriptionName); + } + @Override public boolean equals(@Nullable Object o) { if (this == o) { @@ -293,6 +300,7 @@ public static SubscriptionPath subscriptionPathFromName( /** Path representing a Pubsub topic. 
*/ public static class TopicPath implements Serializable { + // Format: "projects//topics/" private final String path; TopicPath(String path) { @@ -310,6 +318,26 @@ public String getName() { return splits.get(3); } + /** + * Returns the data catalog name. Format "pubsub:topic:`project`.`topic`" This method is + * fail-safe. If topic path is malformed, it returns an empty string. + */ + public String getDataCatalogName() { + List splits = Splitter.on('/').splitToList(path); + if (splits.size() == 4) { + // well-formed path + return String.format("pubsub:topic:%s.%s", splits.get(1), splits.get(3)); + } else { + // Mal-formed path. It is either a test fixture or user error and will fail on publish. + // We do not throw exception instead return empty string here. + LOG.warn( + "Cannot get data catalog name for malformed topic path {}. Expected format: " + + "projects//topics/", + path); + return ""; + } + } + public String getFullPath() { List splits = Splitter.on('/').splitToList(path); checkState(splits.size() == 4, "Malformed topic path %s", path); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 6233cf6690801..0fd4e9207d81a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -49,6 +49,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider; import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; @@ -512,6 +513,10 
@@ public String asPath() { } } + public String dataCatalogName() { + return String.format("pubsub:topic:%s.%s", project, topic); + } + @Override public String toString() { return asPath(); @@ -1617,6 +1622,10 @@ public void finishBundle() throws IOException { for (Map.Entry entry : output.entrySet()) { publish(entry.getKey(), entry.getValue().messages); } + // Report lineage for all topics seen + for (PubsubTopic topic : output.keySet()) { + Lineage.getSinks().add(topic.dataCatalogName()); + } output = null; pubsubClient.close(); pubsubClient = null; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index aa8e3a4114868..defea87e835a8 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -41,6 +41,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.PubsubClientFactory; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.metrics.SinkMetrics; import org.apache.beam.sdk.options.ValueProvider; @@ -69,6 +70,7 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; @@ -231,6 +233,9 @@ private static class 
WriterFn extends DoFn, Void> { /** Client on which to talk to Pubsub. Null until created by {@link #startBundle}. */ private transient @Nullable PubsubClient pubsubClient; + /** Last TopicPath that reported Lineage. */ + private transient @Nullable TopicPath reportedLineage; + private final Counter batchCounter = Metrics.counter(WriterFn.class, "batches"); private final Counter elementCounter = SinkMetrics.elementsWritten(); private final Counter byteCounter = SinkMetrics.bytesWritten(); @@ -290,6 +295,14 @@ private void publishBatch(List messages, int bytes) throws IOEx batchCounter.inc(); elementCounter.inc(messages.size()); byteCounter.inc(bytes); + // Report Lineage multiple once for same topic + if (!topicPath.equals(reportedLineage)) { + String name = topicPath.getDataCatalogName(); + if (!Strings.isNullOrEmpty(name)) { + Lineage.getSinks().add(topicPath.getDataCatalogName()); + } + reportedLineage = topicPath; + } } @StartBundle diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java index b9a554d54ade7..b131b521c067e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSource.java @@ -56,6 +56,7 @@ import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubMessages.DeserializeBytesIntoPubsubMessagePayloadOnly; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Lineage; import org.apache.beam.sdk.metrics.SourceMetrics; import org.apache.beam.sdk.options.ExperimentalOptions; import org.apache.beam.sdk.options.PipelineOptions; @@ -1041,6 +1042,19 @@ public List split(int desiredNumSplits, PipelineOptions options) splitSource = new 
PubsubSource( outer, StaticValueProvider.of(outer.createRandomSubscription(options))); + TopicPath topic = outer.getTopic(); + if (topic != null) { + // is initial split on Read.fromTopic, report Lineage based on topic + Lineage.getSources().add(topic.getDataCatalogName()); + } + } else { + if (subscriptionPath.equals(outer.getSubscriptionProvider())) { + SubscriptionPath sub = subscriptionPath.get(); + if (sub != null) { + // is a split on Read.fromSubscription + Lineage.getSources().add(sub.getDataCatalogName()); + } + } } for (int i = 0; i < desiredNumSplits * SCALE_OUT; i++) { // Since the source is immutable and Pubsub automatically shards we simply diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java index 895ed35bfb120..fb007d1171db1 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubClientTest.java @@ -171,6 +171,7 @@ public void subscriptionPathFromNameWellFormed() { SubscriptionPath path = PubsubClient.subscriptionPathFromName("test", "something"); assertEquals("projects/test/subscriptions/something", path.getPath()); assertEquals("/subscriptions/test/something", path.getFullPath()); + assertEquals("pubsub:subscription:test.something", path.getDataCatalogName()); } @Test @@ -178,6 +179,7 @@ public void topicPathFromNameWellFormed() { TopicPath path = PubsubClient.topicPathFromName("test", "something"); assertEquals("projects/test/topics/something", path.getPath()); assertEquals("/topics/test/something", path.getFullPath()); + assertEquals("pubsub:topic:test.something", path.getDataCatalogName()); } @Test diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java 
b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java index 3027db6aee9d5..74a98f0b8b438 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIOTest.java @@ -237,6 +237,9 @@ public void testValueProviderTopic() { assertThat(pubsubRead.getTopicProvider(), not(nullValue())); assertThat(pubsubRead.getTopicProvider().isAccessible(), is(true)); assertThat(pubsubRead.getTopicProvider().get().asPath(), equalTo(provider.get())); + assertThat( + pubsubRead.getTopicProvider().get().dataCatalogName(), + equalTo("pubsub:topic:project.topic")); } @Test From e3e4454457762c85ca7c8068f0e9f2e20966dccc Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 6 Aug 2024 12:54:31 -0700 Subject: [PATCH 20/78] [#32085][prism] Fix session windowing. (#32086) --- .../runners/prism/internal/handlerunner.go | 24 ++++++++++++------- .../runners/portability/prism_runner_test.py | 22 +++++++++++++++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go index a1eeeba02c4bb..eecebde3d693f 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go +++ b/sdks/go/pkg/beam/runners/prism/internal/handlerunner.go @@ -244,7 +244,7 @@ func (h *runner) ExecuteTransform(stageID, tid string, t *pipepb.PTransform, com kc := coders[kcID] ec := coders[ecID] - data = append(data, gbkBytes(ws, wc, kc, ec, inputData, coders, watermark)) + data = append(data, gbkBytes(ws, wc, kc, ec, inputData, coders)) if len(data[0]) == 0 { panic("no data for GBK") } @@ -290,7 +290,7 @@ func windowingStrategy(comps *pipepb.Components, tid string) *pipepb.WindowingSt } // gbkBytes re-encodes gbk inputs in a gbk result. 
-func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregate [][]byte, coders map[string]*pipepb.Coder, watermark mtime.Time) []byte { +func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregate [][]byte, coders map[string]*pipepb.Coder) []byte { // Pick how the timestamp of the aggregated output is computed. var outputTime func(typex.Window, mtime.Time, mtime.Time) mtime.Time switch ws.GetOutputTime() { @@ -333,9 +333,8 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat kd := pullDecoder(kc, coders) vd := pullDecoder(vc, coders) - // Right, need to get the key coder, and the element coder. - // Cus I'll need to pull out anything the runner knows how to deal with. - // And repeat. + // Aggregate by windows and keys, using the window coder and KV coders. + // We need to extract and split the key bytes from the element bytes. for _, data := range toAggregate { // Parse out each element's data, and repeat. buf := bytes.NewBuffer(data) @@ -388,34 +387,41 @@ func gbkBytes(ws *pipepb.WindowingStrategy, wc, kc, vc *pipepb.Coder, toAggregat } // Use a decreasing sort (latest to earliest) so we can correct // the output timestamp to the new end of window immeadiately. - // TODO need to correct this if output time is different. sort.Slice(ordered, func(i, j int) bool { return ordered[i].MaxTimestamp() > ordered[j].MaxTimestamp() }) cur := ordered[0] sessionData := windows[cur] + delete(windows, cur) for _, iw := range ordered[1:] { - // If they overlap, then we merge the data. + // Check if the gap between windows is less than the gapSize. + // If not, this window is done, and we start a next window. if iw.End+gapSize < cur.Start { - // Start a new session. + // Store current data with the current window. windows[cur] = sessionData + // Use the incoming window instead, and clear it from the map. 
cur = iw sessionData = windows[iw] + delete(windows, cur) + // There's nothing to merge, since we've just started with this windowed data. continue } - // Extend the session + // Extend the session with the incoming window, and merge the the incoming window's data. cur.Start = iw.Start toMerge := windows[iw] delete(windows, iw) for k, kt := range toMerge { skt := sessionData[k] + // Ensure the output time matches the given function. + skt.time = outputTime(cur, kt.time, skt.time) skt.key = kt.key skt.w = cur skt.values = append(skt.values, kt.values...) sessionData[k] = skt } } + windows[cur] = sessionData } // Everything's aggregated! // Time to turn things into a windowed KV> diff --git a/sdks/python/apache_beam/runners/portability/prism_runner_test.py b/sdks/python/apache_beam/runners/portability/prism_runner_test.py index f1ccf66a22894..324fe5a17b545 100644 --- a/sdks/python/apache_beam/runners/portability/prism_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/prism_runner_test.py @@ -40,7 +40,9 @@ from apache_beam.runners.portability import portable_runner_test from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +from apache_beam.transforms import window from apache_beam.transforms.sql import SqlTransform +from apache_beam.utils import timestamp # Run as # @@ -178,6 +180,26 @@ def create_options(self): return options + # Slightly more robust session window test: + # Validates that an inner grouping doesn't duplicate data either. + # Copied also because the timestamp in fn_runner_test.py isn't being + # inferred correctly as seconds for some reason, but as micros. + # The belabored specification is validating the timestamp type works at least. 
+ # See https://github.com/apache/beam/issues/32085 + def test_windowing(self): + with self.create_pipeline() as p: + res = ( + p + | beam.Create([1, 2, 100, 101, 102, 123]) + | beam.Map( + lambda t: window.TimestampedValue( + ('k', t), timestamp.Timestamp.of(t).micros)) + | beam.WindowInto(beam.transforms.window.Sessions(10)) + | beam.GroupByKey() + | beam.Map(lambda k_vs1: (k_vs1[0], sorted(k_vs1[1])))) + assert_that( + res, equal_to([('k', [1, 2]), ('k', [100, 101, 102]), ('k', [123])])) + # Can't read host files from within docker, read a "local" file there. def test_read(self): print('name:', __name__) From 0a42afa9f5c02e9d529e1c1f1b197472a44cc174 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 6 Aug 2024 16:10:08 -0700 Subject: [PATCH 21/78] [prism] Use non-deprecated docker types in environment. (#32092) * [prism] Use non-deprecated docker types in environment. * Include Go mod changes. * Update testcontainers-go * revert toolchain change * go mod tidy requirement, update minimum Go version. * Note Minimum Go version update. --------- Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- CHANGES.md | 1 + sdks/go.mod | 20 +++--- sdks/go.sum | 70 ++++++++++++++----- .../runners/prism/internal/environments.go | 8 +-- 4 files changed, 69 insertions(+), 30 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7f12b53342602..129fa01f94a84 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -68,6 +68,7 @@ ## New Features / Improvements * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Go SDK Minimum Go Version updated to 1.21 ([#32092](https://github.com/apache/beam/pull/32092)). ## Breaking Changes diff --git a/sdks/go.mod b/sdks/go.mod index 1716a6e2d22dc..7eb29b3cc77c3 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -20,7 +20,7 @@ // directory. 
module github.com/apache/beam/sdks/v2 -go 1.20 +go 1.21 require ( cloud.google.com/go/bigquery v1.61.0 @@ -49,7 +49,7 @@ require ( github.com/nats-io/nats.go v1.35.0 github.com/proullon/ramsql v0.1.3 github.com/spf13/cobra v1.8.1 - github.com/testcontainers/testcontainers-go v0.26.0 + github.com/testcontainers/testcontainers-go v0.32.0 github.com/tetratelabs/wazero v1.7.3 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c @@ -79,21 +79,23 @@ require ( dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 // indirect - github.com/Microsoft/hcsshim v0.11.4 // indirect + github.com/Microsoft/hcsshim v0.11.5 // indirect github.com/apache/arrow/go/v15 v15.0.2 // indirect + github.com/containerd/errdefs v0.1.0 // indirect github.com/containerd/log v0.1.0 // indirect - github.com/distribution/reference v0.5.0 // indirect + github.com/distribution/reference v0.6.0 // indirect github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/minio/highwayhash v1.0.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/sys/user v0.1.0 // indirect github.com/nats-io/jwt/v2 v2.5.7 // indirect github.com/nats-io/nkeys v0.4.7 // indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/shirou/gopsutil/v3 v3.23.9 // indirect + github.com/shirou/gopsutil/v3 v3.23.12 // indirect github.com/shoenig/go-m1cpu v0.1.6 // indirect github.com/tklauser/go-sysconf v0.3.12 // indirect github.com/tklauser/numcpus v0.6.1 // indirect @@ -114,7 +116,7 @@ require ( cloud.google.com/go/iam v1.1.8 // indirect cloud.google.com/go/longrunning v0.5.7 // indirect github.com/Azure/go-ansiterm 
v0.0.0-20210617225240-d185dfc1b5a1 // indirect - github.com/Microsoft/go-winio v0.6.1 // indirect + github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect @@ -135,9 +137,9 @@ require ( github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 // indirect - github.com/containerd/containerd v1.7.11 // indirect + github.com/containerd/containerd v1.7.18 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect - github.com/docker/docker v25.0.6+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v27.1.1+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect github.com/envoyproxy/go-control-plane v0.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect @@ -165,7 +167,7 @@ require ( github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect github.com/morikuni/aec v1.0.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.0-rc5 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.18 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pkg/xattr v0.4.9 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index a0b4738decc3d..ce10d84dd044e 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -209,6 +209,7 @@ cloud.google.com/go/datacatalog v1.8.1/go.mod h1:RJ58z4rMp3gvETA465Vg+ag8BGgBdnR cloud.google.com/go/datacatalog v1.12.0/go.mod h1:CWae8rFkfp6LzLumKOnmVh4+Zle4A3NXLzVJ1d1mRm0= cloud.google.com/go/datacatalog v1.13.0/go.mod h1:E4Rj9a5ZtAxcQJlEBTLgMTphfP11/lNaAshpoBgemX8= cloud.google.com/go/datacatalog v1.20.1 
h1:czcba5mxwRM5V//jSadyig0y+8aOHmN7gUl9GbHu59E= +cloud.google.com/go/datacatalog v1.20.1/go.mod h1:Jzc2CoHudhuZhpv78UBAjMEg3w7I9jHA11SbRshWUjk= cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= cloud.google.com/go/dataflow v0.7.0/go.mod h1:PX526vb4ijFMesO1o202EaUmouZKBpjHsTlCtB4parQ= cloud.google.com/go/dataflow v0.8.0/go.mod h1:Rcf5YgTKPtQyYz8bLYhFoIV/vP39eL7fWNcSOyFfLJE= @@ -346,6 +347,7 @@ cloud.google.com/go/kms v1.9.0/go.mod h1:qb1tPTgfF9RQP8e1wq4cLFErVuTJv7UsSC915J8 cloud.google.com/go/kms v1.10.0/go.mod h1:ng3KTUtQQU9bPX3+QGLsflZIHlkbn8amFAMY63m8d24= cloud.google.com/go/kms v1.10.1/go.mod h1:rIWk/TryCkR59GMC3YtHtXeLzd634lBbKenvyySAyYI= cloud.google.com/go/kms v1.18.0 h1:pqNdaVmZJFP+i8OVLocjfpdTWETTYa20FWOegSCdrRo= +cloud.google.com/go/kms v1.18.0/go.mod h1:DyRBeWD/pYBMeyiaXFa/DGNyxMDL3TslIKb8o/JkLkw= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/language v1.7.0/go.mod h1:DJ6dYN/W+SQOjF8e1hLQXMF21AkH2w9wiPzPCJa2MIE= @@ -628,6 +630,7 @@ filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4 gioui.org v0.0.0-20210308172011-57750fc8a0a6/go.mod h1:RSH6KIUZ0p2xy5zHDxgAM4zumjgTw83q2ge/PI+yyw8= git.sr.ht/~sbinet/gg v0.3.1/go.mod h1:KGYtlADtqsqANL9ueOFkWymvzUvLMQllU5Ixo+8v3pc= github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24 h1:bvDV9vkmnHYOMsOr4WLk+Vo07yKIzd94sVoIqshQ4bU= +github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h1:8o94RPi1/7XTJvwPpRSzSUedZrtlirdB3r9Z20bi2f8= github.com/Azure/azure-pipeline-go v0.2.3/go.mod h1:x841ezTBIMG6O3lAcl8ATHnsOPVl2bqk7S3ta6S6u4k= github.com/Azure/azure-storage-blob-go v0.14.0/go.mod h1:SMqIBi+SuiQH32bvyjngEewEeXoPfKMgWlBDaYf6fck= github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= @@ -643,10 +646,10 @@ 
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 h1:oVLqHXhnYtUwM89y9T1fXGaK9wTkXHgNp8/ZNMQzUxE= github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0/go.mod h1:dppbR7CwXD4pgtV9t3wD1812RaLDcBjtblcDF5f1vI0= github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= -github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= -github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= -github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= -github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/Microsoft/hcsshim v0.11.5 h1:haEcLNpj9Ka1gd3B3tAEs9CpE0c+1IhoL59w/exYU38= +github.com/Microsoft/hcsshim v0.11.5/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY= github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk= @@ -754,8 +757,10 @@ github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 h1:DBmgJDC9dTfkVyGgipamEh2BpGYxScCH1TOF1LL1cXc= github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50/go.mod h1:5e1+Vvlzido69INQaVO6d87Qn543Xr6nooe9Kz7oBFM= github.com/colinmarc/hdfs/v2 v2.1.1/go.mod h1:M3x+k8UKKmxtFu++uAZ0OtDU8jR3jnaZIAc6yK4Ue0c= -github.com/containerd/containerd v1.7.11 h1:lfGKw3eU35sjV0aG2eYZTiwFEY1pCzxdzicHP3SZILw= 
-github.com/containerd/containerd v1.7.11/go.mod h1:5UluHxHTX2rdvYuZ5OJTC5m/KJNs0Zs9wVoJm9zf5ZE= +github.com/containerd/containerd v1.7.18 h1:jqjZTQNfXGoEaZdW1WwPU0RqSn1Bm2Ay/KJPUuO8nao= +github.com/containerd/containerd v1.7.18/go.mod h1:IYEk9/IO6wAPUz2bCMVUbsfXjzw5UNP5fLz4PsUygQ4= +github.com/containerd/errdefs v0.1.0 h1:m0wCRBiu1WJT/Fr+iOoQHMQS/eP5myQ8lCv4Dz5ZURM= +github.com/containerd/errdefs v0.1.0/go.mod h1:YgWiiHtLmSeBrvpw+UfPijzbLaB77mEG1WwJTDETIV0= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= @@ -763,13 +768,14 @@ github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHf github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= +github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/distribution/reference v0.5.0 h1:/FUIFXtfc/x2gpa5/VGfiGLuOIdYa1t65IKK2OFGvA0= -github.com/distribution/reference v0.5.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v25.0.6+incompatible h1:5cPwbwriIcsua2REJe8HqQV+6WlWc1byg2QSXzBxBGg= -github.com/docker/docker v25.0.6+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= 
+github.com/docker/docker v27.1.1+incompatible h1:hO/M4MtV36kzKldqnA37IWhebRA+LnqqcqDja6kVaKY= +github.com/docker/docker v27.1.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= @@ -814,6 +820,7 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9 github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gorp/gorp v2.2.0+incompatible h1:xAUh4QgEeqPPhK3vxZN+bzrim1z5Av6q837gtjUlshc= +github.com/go-gorp/gorp v2.2.0+incompatible/go.mod h1:7IfkAQnO7jfT/9IQ3R9wL1dFhukN6aQxzKTHnkxzA/E= github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -881,6 +888,7 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= +github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= 
github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg= @@ -909,6 +917,7 @@ github.com/google/martian/v3 v3.1.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIG github.com/google/martian/v3 v3.2.1/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/martian/v3 v3.3.2/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc= +github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= @@ -967,6 +976,7 @@ github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFb github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4ZsPv9hVvWI6+ch50m39Pf2Ks= github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w= github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -976,11 +986,16 @@ github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1: github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod 
h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= github.com/jackc/pgx/v5 v5.3.1 h1:Fcr8QJ1ZeLi5zsPZqQeUZhNhxfkkKBOgJuYkJHoBOtU= +github.com/jackc/pgx/v5 v5.3.1/go.mod h1:t3JDKnCBlYIc0ewLF0Q7B8MXmoIaBOZj/ic7iHozM/8= github.com/jcmturner/gofork v0.0.0-20180107083740-2aebee971930/go.mod h1:MK8+TM0La+2rjBD4jE12Kj1pCCxK7d2LK/UM3ncEo0o= github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= +github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= @@ -990,6 +1005,7 @@ github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfC github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 h1:eQGUsj2LcsLzfrHY1noKDSU7h+c9/rw9pQPwbQ9g1jQ= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6/go.mod h1:LIAXxPvcUXwOcTIj9LSNSUpE9/eMHalTWxsP/kmWxQI= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod 
h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= @@ -1012,6 +1028,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -1036,8 +1053,13 @@ github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8Ie github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= +github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.66 h1:bnTOXOHjOqv/gcMuiVbN9o2ngRItvqE774dG9nq0Dzw= +github.com/minio/minio-go/v7 v7.0.66/go.mod h1:DHAgmyQEGdW3Cif0UooKOyrT3Vxs82zNdV6tkKhRtbs= github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= +github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= 
github.com/moby/patternmatcher v0.6.0/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5lXtc= @@ -1047,7 +1069,9 @@ github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4 github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0= github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= @@ -1065,8 +1089,8 @@ github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OS github.com/ncw/swift v1.0.52/go.mod h1:23YIA4yWVnGwv2dQlN4bB7egfYX6YLn0Yo/S6zZO/ZM= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.0-rc5 h1:Ygwkfw9bpDvs+c9E34SdgGOj41dX/cbdlwvlWt0pnFI= -github.com/opencontainers/image-spec v1.1.0-rc5/go.mod h1:X4pATf0uXsnn3g5aiGIsVnJBR4mxhKzfwmvK/B2NTm8= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/pborman/getopt 
v0.0.0-20180729010549-6fdd0a2c7117/go.mod h1:85jBQOZwpVEaDAr341tbn15RS4fCAsIst0qp7i8ex1o= github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= @@ -1099,6 +1123,7 @@ github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTE github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= +github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= @@ -1106,8 +1131,8 @@ github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46 h1:GHRpF1pTW19a github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46/go.mod h1:uAQ5PCi+MFsC7HjREoAz1BU+Mq60+05gifQSsHSDG/8= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63 h1:J6qvD6rbmOil46orKqJaRPG+zTpoGlBTUdyv8ki63L0= github.com/shabbyrobe/gocovmerge v0.0.0-20180507124511-f6ea450bfb63/go.mod h1:n+VKSARF5y/tS9XFSP7vWDfS+GUC5vs/YT7M5XDTUEM= -github.com/shirou/gopsutil/v3 v3.23.9 h1:ZI5bWVeu2ep4/DIxB4U9okeYJ7zp/QLTO4auRb/ty/E= -github.com/shirou/gopsutil/v3 v3.23.9/go.mod h1:x/NWSb71eMcjFIO0vhyGW5nZ7oSIgVjrCnADckb85GA= +github.com/shirou/gopsutil/v3 v3.23.12 h1:z90NtUkp3bMtmICZKpC4+WaknU1eXtp5vtbQ11DgpE4= +github.com/shirou/gopsutil/v3 v3.23.12/go.mod h1:1FrWgea594Jp7qmjHUUPlJDTPgcsb9mGnXDxavtikzM= github.com/shoenig/go-m1cpu v0.1.6 h1:nxdKQNcEB6vzgA2E2bvzKIYRuNj7XNJ4S/aRSwKzFtM= github.com/shoenig/go-m1cpu v0.1.6/go.mod 
h1:1JJMcUBvfNwpq05QDQVAnx3gUHr9IYF7GNg9SUEw2VQ= github.com/shoenig/test v0.6.4 h1:kVTaSd7WLz5WZ2IaoM0RSzRsUD+m8wRR+5qvntpn4LU= @@ -1141,8 +1166,9 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/testcontainers/testcontainers-go v0.26.0 h1:uqcYdoOHBy1ca7gKODfBd9uTHVK3a7UL848z09MVZ0c= -github.com/testcontainers/testcontainers-go v0.26.0/go.mod h1:ICriE9bLX5CLxL9OFQ2N+2N+f+803LNJ1utJb1+Inx0= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/testcontainers/testcontainers-go v0.32.0 h1:ug1aK08L3gCHdhknlTTwWjPHPS+/alvLJU/DRxTD/ME= +github.com/testcontainers/testcontainers-go v0.32.0/go.mod h1:CRHrzHLQhlXUsa5gXjTOfqIEJcrK5+xMDmBr/WMI88E= github.com/tetratelabs/wazero v1.7.3 h1:PBH5KVahrt3S2AHgEjKu4u+LlDbbk+nsGE3KLucy6Rw= github.com/tetratelabs/wazero v1.7.3/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= @@ -1198,7 +1224,9 @@ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1: go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 h1:IeMeyr1aBvBiPVYihXIaeIZba6b8E1bYp7lbdxK8CQg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod 
h1:oVdCUtjq9MK9BlS7TtucsQwUcXcymNiEDjgDD2jMtZU= go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= @@ -1209,6 +1237,7 @@ go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqe go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I= +go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= golang.org/x/crypto v0.0.0-20180723164146-c126467f60eb/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -1482,7 +1511,7 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -1493,6 +1522,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term 
v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1600,6 +1631,7 @@ gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o= +gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= @@ -1877,6 +1909,7 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/jcmturner/aescts.v1 v1.0.1/go.mod h1:nsR8qBOg+OucoIW+WMhB3GspUQXq9XorLnQb9XtvcOo= gopkg.in/jcmturner/dnsutils.v1 v1.0.1/go.mod 
h1:m3v+5svpVOhtFAP/wSz+yzh4Mc0Fg7eRhxkJMWSIz9Q= gopkg.in/jcmturner/goidentity.v3 v3.0.0/go.mod h1:oG2kH0IvSYNIu80dVAyu/yoefjq1mNfM5bm88whjWx4= @@ -1892,8 +1925,11 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/postgres v1.5.2 h1:ytTDxxEv+MplXOfFe3Lzm7SjG09fcdb3Z/c056DTBx0= +gorm.io/driver/postgres v1.5.2/go.mod h1:fmpX0m2I1PKuR7mKZiEluwrP3hbs+ps7JIGMUBpCgl8= gorm.io/gorm v1.25.2 h1:gs1o6Vsa+oVKG/a9ElL3XgyGfghFfkKA2SInQaCyMho= +gorm.io/gorm v1.25.2/go.mod h1:L4uxeKpfBml98NYqVqwAdmV1a2nBtAec/cf3fpucW/k= gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= +gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/sdks/go/pkg/beam/runners/prism/internal/environments.go b/sdks/go/pkg/beam/runners/prism/internal/environments.go index 3a429920fb289..add7f769a702e 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/environments.go +++ b/sdks/go/pkg/beam/runners/prism/internal/environments.go @@ -32,8 +32,8 @@ import ( "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/proto" - dtyp "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/image" "github.com/docker/docker/api/types/mount" dcli "github.com/docker/docker/client" "github.com/docker/docker/pkg/stdcopy" @@ -132,7 +132,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock } if _, _, err := cli.ImageInspectWithRaw(ctx, 
dp.GetContainerImage()); err != nil { // We don't have a local image, so we should pull it. - if rc, err := cli.ImagePull(ctx, dp.GetContainerImage(), dtyp.ImagePullOptions{}); err == nil { + if rc, err := cli.ImagePull(ctx, dp.GetContainerImage(), image.PullOptions{}); err == nil { // Copy the output, but discard it so we can wait until the image pull is finished. io.Copy(io.Discard, rc) rc.Close() @@ -164,7 +164,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock containerID := ccr.ID logger = logger.With("container", containerID) - if err := cli.ContainerStart(ctx, containerID, dtyp.ContainerStartOptions{}); err != nil { + if err := cli.ContainerStart(ctx, containerID, container.StartOptions{}); err != nil { cli.Close() return fmt.Errorf("unable to start container image %v with docker for env %v, err: %w", dp.GetContainerImage(), wk.Env, err) } @@ -189,7 +189,7 @@ func dockerEnvironment(ctx context.Context, logger *slog.Logger, dp *pipepb.Dock case resp := <-statusCh: logger.Info("docker container has self terminated", "status_code", resp.StatusCode) - rc, err := cli.ContainerLogs(ctx, containerID, dtyp.ContainerLogsOptions{Details: true, ShowStdout: true, ShowStderr: true}) + rc, err := cli.ContainerLogs(ctx, containerID, container.LogsOptions{Details: true, ShowStdout: true, ShowStderr: true}) if err != nil { logger.Error("docker container logs error", "error", err) } From 9b564ef925b83ca040c46d54314c600f5e65940c Mon Sep 17 00:00:00 2001 From: tvalentyn Date: Tue, 6 Aug 2024 17:33:29 -0700 Subject: [PATCH 22/78] Exclude a not yet implemented pandas op from dataframe tests. 
(#32066) --- sdks/python/apache_beam/dataframe/pandas_doctests_test.py | 1 + sdks/python/setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py index a4bd0d0a81278..c7ea908a93365 100644 --- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py +++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py @@ -557,6 +557,7 @@ def test_series_tests(self): 'pandas.core.series.Series': ['ser.iloc[0] = 999'], }, not_implemented_ok={ + 'pandas.core.series.Series.case_when': ['*'], 'pandas.core.series.Series.transform': [ # str arg not supported. Tested with np.sum in # frames_test.py::DeferredFrameTest::test_groupby_transform_sum diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c9b2d087d04ca..756c952b0101b 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -155,7 +155,7 @@ def cythonize(*args, **kwargs): # Exclude 1.5.0 and 1.5.1 because of # https://github.com/pandas-dev/pandas/issues/45725 dataframe_dependency = [ - 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.2;python_version>="3.8"', + 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.3;python_version>="3.8"', ] From eeddc6924c3230f6b502af67914918730a27efc9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:36:59 -0700 Subject: [PATCH 23/78] Bump google.golang.org/grpc from 1.64.0 to 1.65.0 in /sdks (#31824) Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.64.0 to 1.65.0. - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.64.0...v1.65.0) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 7 ++++--- sdks/go.sum | 13 ++++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 7eb29b3cc77c3..9cdad36010e43 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -61,7 +61,7 @@ require ( golang.org/x/text v0.16.0 google.golang.org/api v0.187.0 google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d - google.golang.org/grpc v1.64.1 + google.golang.org/grpc v1.65.0 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -74,6 +74,7 @@ require ( ) require ( + cel.dev/expr v0.15.0 // indirect cloud.google.com/go/auth v0.6.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect @@ -135,8 +136,8 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.28.12 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect - github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b // indirect github.com/containerd/containerd v1.7.18 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect github.com/docker/docker v27.1.1+incompatible // but required to resolve issue docker has with go1.20 diff --git a/sdks/go.sum b/sdks/go.sum index ce10d84dd044e..935009cf1d83d 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1,3 +1,5 @@ +cel.dev/expr v0.15.0 h1:O1jzfJCQBfL5BFoYktaxwIhuttaQPsVWerH9/EEKx0w= +cel.dev/expr v0.15.0/go.mod h1:TRSuuV7DlVCE/uwv5QbAiW/v8l5O8C4eEPHeu7gf7Sg= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod 
h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -735,8 +737,9 @@ github.com/census-instrumentation/opencensus-proto v0.4.1 h1:iKLQ0xPNFxR/2hzXZMr github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= @@ -754,8 +757,8 @@ github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20220314180256-7f1daf1720fc/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50 h1:DBmgJDC9dTfkVyGgipamEh2BpGYxScCH1TOF1LL1cXc= -github.com/cncf/xds/go v0.0.0-20240318125728-8a4994d93e50/go.mod h1:5e1+Vvlzido69INQaVO6d87Qn543Xr6nooe9Kz7oBFM= +github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b h1:ga8SEFjZ60pxLcmhnThWgvH2wg8376yUJmPhEH4H3kw= +github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= 
github.com/colinmarc/hdfs/v2 v2.1.1/go.mod h1:M3x+k8UKKmxtFu++uAZ0OtDU8jR3jnaZIAc6yK4Ue0c= github.com/containerd/containerd v1.7.18 h1:jqjZTQNfXGoEaZdW1WwPU0RqSn1Bm2Ay/KJPUuO8nao= github.com/containerd/containerd v1.7.18/go.mod h1:IYEk9/IO6wAPUz2bCMVUbsfXjzw5UNP5fLz4PsUygQ4= @@ -1881,8 +1884,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= -google.golang.org/grpc v1.64.1 h1:LKtvyfbX3UGVPFcGqJ9ItpVWW6oN/2XqTxfAnwRRXiA= -google.golang.org/grpc v1.64.1/go.mod h1:hiQF4LFZelK2WKaP6W0L92zGHtiQdZxk8CrSdvyjeP0= +google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc= +google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From 99672af7fe12f72b562289d0c9449e4711b973c2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:38:40 -0700 Subject: [PATCH 24/78] Bump torch from 1.13.1 to 2.2.0 in /sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train (#31983) Bumps [torch](https://github.com/pytorch/pytorch) from 1.13.1 to 2.2.0. 
- [Release notes](https://github.com/pytorch/pytorch/releases) - [Changelog](https://github.com/pytorch/pytorch/blob/main/RELEASE.md) - [Commits](https://github.com/pytorch/pytorch/compare/v1.13.1...v2.2.0) --- updated-dependencies: - dependency-name: torch dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../ml-orchestration/kfp/components/train/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt index 2e65f0fba2468..ba1103dd1ef96 100644 --- a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -torch==1.13.1 +torch==2.2.0 numpy==1.22.4 Pillow==10.2.0 \ No newline at end of file From ebba3bb026b63f05d358a2cb5608cf61107ee504 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 22:23:26 -0700 Subject: [PATCH 25/78] Bump go.mongodb.org/mongo-driver from 1.13.1 to 1.16.0 in /sdks (#32097) Bumps [go.mongodb.org/mongo-driver](https://github.com/mongodb/mongo-go-driver) from 1.13.1 to 1.16.0. - [Release notes](https://github.com/mongodb/mongo-go-driver/releases) - [Commits](https://github.com/mongodb/mongo-go-driver/compare/v1.13.1...v1.16.0) --- updated-dependencies: - dependency-name: go.mongodb.org/mongo-driver dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 4 ++-- sdks/go.sum | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 9cdad36010e43..2d638fc1998ff 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -53,7 +53,7 @@ require ( github.com/tetratelabs/wazero v1.7.3 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c - go.mongodb.org/mongo-driver v1.13.1 + go.mongodb.org/mongo-driver v1.16.0 golang.org/x/net v0.26.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.7.0 @@ -165,7 +165,7 @@ require ( github.com/moby/patternmatcher v0.6.0 // indirect github.com/moby/sys/sequential v0.5.0 // indirect github.com/moby/term v0.5.0 // indirect - github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect + github.com/montanaflynn/stats v0.7.1 // indirect github.com/morikuni/aec v1.0.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.0 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 935009cf1d83d..138c484884d55 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1019,7 +1019,6 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.9.7/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= -github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= @@ -1075,8 
+1074,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= github.com/nats-io/jwt/v2 v2.5.7 h1:j5lH1fUXCnJnY8SsQeB/a/z9Azgu2bYIDvtPVNdxe2c= @@ -1209,8 +1208,8 @@ github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaD go.einride.tech/aip v0.67.1 h1:d/4TW92OxXBngkSOwWS2CH5rez869KpKMaN44mdxkFI= go.einride.tech/aip v0.67.1/go.mod h1:ZGX4/zKw8dcgzdLsrvpOOGxfxI2QSk12SlP7d6c0/XI= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= -go.mongodb.org/mongo-driver v1.13.1 h1:YIc7HTYsKndGK4RFzJ3covLz1byri52x0IoMB0Pt/vk= -go.mongodb.org/mongo-driver v1.13.1/go.mod h1:wcDf1JBCXy2mOW0bWHwO/IOYqdca1MPCwDtFu/Z9+eo= +go.mongodb.org/mongo-driver v1.16.0 h1:tpRsfBJMROVHKpdGyc1BBEzzjDUWjItxbVSZ8Ls4BQ4= +go.mongodb.org/mongo-driver v1.16.0/go.mod h1:oB6AhJQvFQL4LEHyXi6aJzQJtBiTQHiAd83l0GdFaiw= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -1253,7 
+1252,6 @@ golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1357,7 +1355,6 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211015210444-4f30a5c0130f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= From 44a9942719e754fbd1967e475466f404b9f47e22 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Wed, 7 Aug 2024 08:19:28 +0200 Subject: [PATCH 26/78] Add warning + doc callout when encountering ri pickling errors (#32063) --- sdks/python/apache_beam/ml/inference/base.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/sdks/python/apache_beam/ml/inference/base.py b/sdks/python/apache_beam/ml/inference/base.py index 401b57fdb8079..29a568def07b9 100644 --- a/sdks/python/apache_beam/ml/inference/base.py +++ b/sdks/python/apache_beam/ml/inference/base.py @@ -1586,6 +1586,15 @@ def _run_inference(self, batch, inference_args): except BaseException as e: if self._metrics_collector: self._metrics_collector.failed_batches_counter.inc() + if (isinstance(e, pickle.PickleError) and + self._model_handler.share_model_across_processes()): + raise TypeError( + 'Pickling error encountered while running inference. ' + 'This may be caused by trying to send unpickleable ' + 'data to a model which is shared across processes. ' + 'For more information, see ' + 'https://beam.apache.org/documentation/ml/large-language-modeling/#pickling-errors' # pylint: disable=line-too-long + ) from e raise e predictions = list(result_generator) From 0d81c5993049bc72116c2871c9b50bbe4cfc43d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:08:13 -0400 Subject: [PATCH 27/78] Bump golang.org/x/text from 0.16.0 to 0.17.0 in /sdks (#32098) Bumps [golang.org/x/text](https://github.com/golang/text) from 0.16.0 to 0.17.0. - [Release notes](https://github.com/golang/text/releases) - [Commits](https://github.com/golang/text/compare/v0.16.0...v0.17.0) --- updated-dependencies: - dependency-name: golang.org/x/text dependency-type: direct:production update-type: version-update:semver-minor ...
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 4 ++-- sdks/go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 2d638fc1998ff..958a228c3546b 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -56,9 +56,9 @@ require ( go.mongodb.org/mongo-driver v1.16.0 golang.org/x/net v0.26.0 golang.org/x/oauth2 v0.21.0 - golang.org/x/sync v0.7.0 + golang.org/x/sync v0.8.0 golang.org/x/sys v0.21.0 - golang.org/x/text v0.16.0 + golang.org/x/text v0.17.0 google.golang.org/api v0.187.0 google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d google.golang.org/grpc v1.65.0 diff --git a/sdks/go.sum b/sdks/go.sum index 138c484884d55..ef3c436f22e02 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1423,8 +1423,8 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1540,8 +1540,8 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod 
h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= From 81ad4fee378a586701dc0ff25bcc3c5cd7a9f3f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:08:42 -0400 Subject: [PATCH 28/78] Bump github.com/aws/aws-sdk-go-v2/credentials in /sdks (#32096) Bumps [github.com/aws/aws-sdk-go-v2/credentials](https://github.com/aws/aws-sdk-go-v2) from 1.17.18 to 1.17.27. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/credentials/v1.17.18...credentials/v1.17.27) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/credentials dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 23 +++++++++++------------ sdks/go.sum | 45 ++++++++++++++++++++++----------------------- 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 958a228c3546b..a5ad9f3b7f5c5 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -30,12 +30,12 @@ require ( cloud.google.com/go/pubsub v1.39.0 cloud.google.com/go/spanner v1.63.0 cloud.google.com/go/storage v1.43.0 - github.com/aws/aws-sdk-go-v2 v1.30.0 + github.com/aws/aws-sdk-go-v2 v1.30.3 github.com/aws/aws-sdk-go-v2/config v1.27.4 - github.com/aws/aws-sdk-go-v2/credentials v1.17.18 + github.com/aws/aws-sdk-go-v2/credentials v1.17.27 github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 - github.com/aws/smithy-go v1.20.2 + github.com/aws/smithy-go v1.20.3 github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.8.1 @@ -122,18 +122,18 @@ require ( github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 // indirect 
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.20.11 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.24.5 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.28.12 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect @@ -158,7 +158,6 @@ require ( github.com/gorilla/handlers v1.5.2 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/klauspost/compress v1.17.8 // indirect github.com/klauspost/cpuid/v2 v2.2.6 // indirect github.com/magiconair/properties v1.8.7 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index ef3c436f22e02..6afc175732ff3 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -677,39 +677,39 @@ github.com/aws/aws-sdk-go v1.30.19/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.34.0 h1:brux2dRrlwCF5JhTL7MUT3WUwo9zfDHZZp3+g3Mvlmo= github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= -github.com/aws/aws-sdk-go-v2 v1.30.0 h1:6qAwtzlfcTtcL8NHtbDQAqgM5s6NDipQTkPxyH/6kAA= -github.com/aws/aws-sdk-go-v2 v1.30.0/go.mod h1:ffIFB97e2yNsv4aTSGkqtHnppsIJzw7G7BReUZ3jCXM= +github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP8dY= 
+github.com/aws/aws-sdk-go-v2 v1.30.3/go.mod h1:nIQjQVp5sfpQcTc9mPSr1B0PaWK5ByX9MOoDadSN4lc= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 h1:ZY3108YtBNq96jNZTICHxN1gSBSbnvIdYwwqnvCV4Mc= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1/go.mod h1:t8PYl/6LzdAqsU4/9tz28V/kU+asFePvpOMkdul0gEQ= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= github.com/aws/aws-sdk-go-v2/config v1.27.4 h1:AhfWb5ZwimdsYTgP7Od8E9L1u4sKmDW2ZVeLcf2O42M= github.com/aws/aws-sdk-go-v2/config v1.27.4/go.mod h1:zq2FFXK3A416kiukwpsd+rD4ny6JC7QSkp4QdN1Mp2g= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.17.18 h1:D/ALDWqK4JdY3OFgA2thcPO1c9aYTT5STS/CvnkqY1c= -github.com/aws/aws-sdk-go-v2/credentials v1.17.18/go.mod h1:JuitCWq+F5QGUrmMPsk945rop6bB57jdscu+Glozdnc= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27 h1:2raNba6gr2IfA0eqqiP2XiQ0UVOpGPgDSi0I9iAP+UI= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27/go.mod h1:gniiwbGahQByxan6YjQUMcW4Aov6bLC3m+evgcoN4r4= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5 h1:dDgptDO9dxeFkXy+tEgVkzSClHZje/6JkPW5aZyEvrQ= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.5/go.mod h1:gjvE2KBUgUQhcv89jqxrIxH9GaKs1JbZzWejj/DaHGA= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 h1:KreluoV8FZDEtI6Co2xuNk/UqI9iwMrOx/87PBNIKqw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvHE0Tjvn7kbxaUhl75CJi1sbfhMxkU= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 h1:wuOjvalpd2CnXffks74Vq6n3yv9vunKCoy4R1sjStGk= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8/go.mod h1:vywwjy6VnrR48Izg136JoSUXC4mH9QeUi3g0EH9DSrA= 
-github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9 h1:cy8ahBJuhtM8GTTSyOkfy6WVPV1IE+SS5/wfXUYuulw= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.9/go.mod h1:CZBXGLaJnEZI6EVNcPd7a6B5IC5cA/GkRWtu9fp3S6Y= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9 h1:A4SYk07ef04+vxZToz9LWvAXl9LW0NClpPpMsi31cz0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.9/go.mod h1:5jJcHuwDagxN+ErjQ3PU3ocf6Ylc/p9x+BLO/+X4iXw= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15/go.mod h1:U9ke74k1n2bf+RIgoX1SXFed1HLs51OgUSs+Ph0KJP8= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 h1:C6WHdGnTDIYETAm5iErQUiVNsclNx9qbJVPIt03B6bI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15/go.mod h1:ZQLZqhcu+JhSrA9/NXRm8SkDvsycE+JkV3WGY41e+IM= github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY= github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 h1:lMwCXiWJlrtZot0NJTjbC8G9zl+V3i68gBTBBvDeEXA= github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3/go.mod h1:5yzAuE9i2RkVAttBl8yxZgQr5OCq4D5yDnG7j9x2L0U= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2 h1:Ji0DY1xUsUr3I8cHps0G+XM3WWU16lP6yG8qu1GAZAs= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.2/go.mod h1:5CsjAbs3NlGQyZNFACh+zztPDI7fU6eW9QsxjfnuBKg= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 h1:dT3MqvGhSoaIhRseqw2I0yH81l7wiR2vjs57O51EAm8= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3/go.mod 
h1:GlAeCkHwugxdHaueRr4nhPuY+WW+gR8UjlcqzPr1SPI= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 h1:xbwRyCy7kXrOj89iIKLB6NfE2WCpP9HoKyk8dMDvnIQ= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3/go.mod h1:R+/S1O4TYpcktbVwddeOYg+uwUfLhADP2S/x4QwsCTM= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11 h1:o4T+fKxA3gTMcluBNZZXE9DNaMkJuUL1O3mffCUjoJo= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.11/go.mod h1:84oZdJ+VjuJKs9v1UTC9NaodRZRseOXCTgku+vQJWR8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrxZlQ044RiM+WdoZxp0p+EGM62y3L6pwA4olE= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 h1:KV0z2RDc7euMtg8aUT1czv5p29zcLlXALNFsd3jkkEc= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3/go.mod h1:KZgs2ny8HsxRIRbDwgvJcHHBZPOzQr/+NtGwnP+w2ec= @@ -717,16 +717,16 @@ github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32 github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 h1:NnduxUd9+Fq9DcCDdJK8v6l9lR1xDX4usvog+JuQAno= github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2/go.mod h1:NXRKkiRF+erX2hnybnVU660cYT5/KChRD4iUgJ97cI8= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= -github.com/aws/aws-sdk-go-v2/service/sso v1.20.11 h1:gEYM2GSpr4YNWc6hCd5nod4+d4kd9vWIAWrmGuLdlMw= -github.com/aws/aws-sdk-go-v2/service/sso v1.20.11/go.mod h1:gVvwPdPNYehHSP9Rs7q27U1EU+3Or2ZpXvzAYJNh63w= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.24.5 h1:iXjh3uaH3vsVcnyZX7MqCoCfcyxIrVE9iOQruRaWPrQ= -github.com/aws/aws-sdk-go-v2/service/ssooidc 
v1.24.5/go.mod h1:5ZXesEuy/QcO0WUnt+4sDkxhdXRHTu2yG0uCSH8B6os= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 h1:BXx0ZIxvrJdSgSvKTZ+yRBeSqqgPM89VPlulEcl37tM= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4/go.mod h1:ooyCOXjvJEsUw7x+ZDHeISPMhtwI3ZCB7ggFMcFfWLU= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 h1:yiwVzJW2ZxZTurVbYWA7QOrAaCYQR72t0wrSBfoesUE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4/go.mod h1:0oxfLkpz3rQ/CHlx5hB7H69YUpFiI1tql6Q6Ne+1bCw= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.28.12 h1:M/1u4HBpwLuMtjlxuI2y6HoVLzF5e2mfxHCg7ZVMYmk= -github.com/aws/aws-sdk-go-v2/service/sts v1.28.12/go.mod h1:kcfd+eTdEi/40FIbLq4Hif3XMXnl5b/+t/KTfLt9xIk= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 h1:ZsDKRLXGWHk8WdtyYMoGNO7bTudrvuKpDKgMVRlepGE= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3/go.mod h1:zwySh8fpFyXp9yOr/KVzxOl8SRqgf/IDw5aUt9UKFcQ= github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= -github.com/aws/smithy-go v1.20.2 h1:tbp628ireGtzcHDDmLT/6ADHidqnwgF57XOXZe6tp4Q= -github.com/aws/smithy-go v1.20.2/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= +github.com/aws/smithy-go v1.20.3 h1:ryHwveWzPV5BIof6fyDvor6V3iUL7nTfiTKXHiW05nE= +github.com/aws/smithy-go v1.20.3/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= @@ -1003,7 +1003,6 @@ github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= 
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= -github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 h1:eQGUsj2LcsLzfrHY1noKDSU7h+c9/rw9pQPwbQ9g1jQ= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6/go.mod h1:LIAXxPvcUXwOcTIj9LSNSUpE9/eMHalTWxsP/kmWxQI= From 828717a71d638664ba12cad5c0c00193bb1cde35 Mon Sep 17 00:00:00 2001 From: Vatsal <36672090+imvtsl@users.noreply.github.com> Date: Wed, 7 Aug 2024 09:54:15 -0700 Subject: [PATCH 29/78] [#21515][Go SDK] Update go protobuf package to new version (#32045) --- CHANGES.md | 1 + sdks/go.mod | 2 +- sdks/go/cmd/beamctl/cmd/provision.go | 3 +-- sdks/go/container/boot_test.go | 2 +- sdks/go/container/tools/provision.go | 13 +++++++++---- sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go | 2 +- sdks/go/pkg/beam/artifact/gcsproxy/staging.go | 2 +- sdks/go/pkg/beam/artifact/materialize.go | 2 +- sdks/go/pkg/beam/artifact/materialize_test.go | 2 +- sdks/go/pkg/beam/coder.go | 12 ++++++------ sdks/go/pkg/beam/core/runtime/exec/translate.go | 2 +- sdks/go/pkg/beam/core/runtime/graphx/coder.go | 8 ++++---- .../pkg/beam/core/runtime/graphx/schema/schema.go | 2 +- .../beam/core/runtime/graphx/schema/schema_test.go | 3 +-- sdks/go/pkg/beam/core/runtime/graphx/translate.go | 8 ++++---- .../pkg/beam/core/runtime/graphx/translate_test.go | 10 +++++----- .../pkg/beam/core/runtime/harness/harness_test.go | 2 +- sdks/go/pkg/beam/core/runtime/harness/statemgr.go | 5 ++--- .../pkg/beam/core/runtime/pipelinex/clone_test.go | 2 +- sdks/go/pkg/beam/core/runtime/pipelinex/replace.go | 2 +- .../pkg/beam/core/runtime/pipelinex/replace_test.go | 2 +- sdks/go/pkg/beam/core/runtime/pipelinex/util.go | 2 +- .../go/pkg/beam/core/runtime/xlangx/resolve_test.go | 2 
+- sdks/go/pkg/beam/core/util/protox/any.go | 6 +++--- sdks/go/pkg/beam/core/util/protox/any_test.go | 4 ++-- sdks/go/pkg/beam/core/util/protox/base64.go | 2 +- sdks/go/pkg/beam/core/util/protox/protox.go | 2 +- sdks/go/pkg/beam/create_test.go | 6 ++---- sdks/go/pkg/beam/provision/provision.go | 2 +- sdks/go/pkg/beam/runners/dataflow/dataflow.go | 3 +-- .../beam/runners/dataflow/dataflowlib/execute.go | 3 +-- sdks/go/pkg/beam/runners/universal/runnerlib/job.go | 3 +-- .../pkg/beam/runners/universal/runnerlib/stage.go | 2 +- sdks/go/pkg/beam/runners/universal/universal.go | 3 +-- .../go/pkg/beam/transforms/xlang/schema/external.go | 2 +- sdks/java/container/boot.go | 7 +++---- sdks/python/container/boot.go | 12 ++++++------ 37 files changed, 72 insertions(+), 76 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 129fa01f94a84..d082f03fd310e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -69,6 +69,7 @@ * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * Go SDK Minimum Go Version updated to 1.21 ([#32092](https://github.com/apache/beam/pull/32092)). +* Updated Go protobuf package to new version (Go) ([#21515](https://github.com/apache/beam/issues/21515)). 
## Breaking Changes diff --git a/sdks/go.mod b/sdks/go.mod index a5ad9f3b7f5c5..fb0b7f85f3dea 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -39,7 +39,6 @@ require ( github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 github.com/go-sql-driver/mysql v1.8.1 - github.com/golang/protobuf v1.5.4 // TODO(danoliveira): Fully replace this with google.golang.org/protobuf github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.6.0 github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 @@ -88,6 +87,7 @@ require ( github.com/go-logr/logr v1.4.1 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect + github.com/golang/protobuf v1.5.4 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/minio/highwayhash v1.0.2 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect diff --git a/sdks/go/cmd/beamctl/cmd/provision.go b/sdks/go/cmd/beamctl/cmd/provision.go index cab82f7bf9db8..878c9a77da82f 100644 --- a/sdks/go/cmd/beamctl/cmd/provision.go +++ b/sdks/go/cmd/beamctl/cmd/provision.go @@ -17,7 +17,6 @@ package cmd import ( fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - "github.com/golang/protobuf/proto" "github.com/spf13/cobra" ) @@ -53,6 +52,6 @@ func infoFn(cmd *cobra.Command, args []string) error { return err } - cmd.Print(proto.MarshalTextString(info.GetInfo())) + cmd.Print(info.GetInfo().String()) return nil } diff --git a/sdks/go/container/boot_test.go b/sdks/go/container/boot_test.go index e799e5d65b0cc..49c78047249e5 100644 --- a/sdks/go/container/boot_test.go +++ b/sdks/go/container/boot_test.go @@ -25,7 +25,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/artifact" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) func 
TestEnsureEndpointsSet_AllSet(t *testing.T) { diff --git a/sdks/go/container/tools/provision.go b/sdks/go/container/tools/provision.go index dab3383fc1710..6b370a5c2e663 100644 --- a/sdks/go/container/tools/provision.go +++ b/sdks/go/container/tools/provision.go @@ -29,8 +29,8 @@ import ( fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/jsonpb" - google_pb "github.com/golang/protobuf/ptypes/struct" + "google.golang.org/protobuf/encoding/protojson" + google_pb "google.golang.org/protobuf/types/known/structpb" ) // ProvisionInfo returns the runtime provisioning info for the worker. @@ -65,7 +65,8 @@ func OptionsToProto(v any) (*google_pb.Struct, error) { // JSONToProto converts JSON-encoded pipeline options to a proto struct. func JSONToProto(data string) (*google_pb.Struct, error) { var out google_pb.Struct - if err := jsonpb.UnmarshalString(string(data), &out); err != nil { + + if err := protojson.Unmarshal([]byte(data), &out); err != nil { return nil, err } return &out, nil @@ -85,5 +86,9 @@ func ProtoToJSON(opt *google_pb.Struct) (string, error) { if opt == nil { return "{}", nil } - return (&jsonpb.Marshaler{}).MarshalToString(opt) + bytes, err := protojson.Marshal(opt) + if err != nil { + return "", err + } + return string(bytes), err } diff --git a/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go b/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go index 15c2d9e2954a4..ceb8a319be982 100644 --- a/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go +++ b/sdks/go/pkg/beam/artifact/gcsproxy/retrieval.go @@ -22,8 +22,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" - "github.com/golang/protobuf/proto" "golang.org/x/net/context" + "google.golang.org/protobuf/proto" ) // RetrievalServer is a artifact retrieval server 
backed by Google diff --git a/sdks/go/pkg/beam/artifact/gcsproxy/staging.go b/sdks/go/pkg/beam/artifact/gcsproxy/staging.go index a295084398078..9113e780f3391 100644 --- a/sdks/go/pkg/beam/artifact/gcsproxy/staging.go +++ b/sdks/go/pkg/beam/artifact/gcsproxy/staging.go @@ -28,8 +28,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" - "github.com/golang/protobuf/proto" "golang.org/x/net/context" + "google.golang.org/protobuf/proto" ) // StagingServer is a artifact staging server backed by Google Cloud Storage diff --git a/sdks/go/pkg/beam/artifact/materialize.go b/sdks/go/pkg/beam/artifact/materialize.go index 866e0dd99b9fa..624e30efcd2b3 100644 --- a/sdks/go/pkg/beam/artifact/materialize.go +++ b/sdks/go/pkg/beam/artifact/materialize.go @@ -38,7 +38,7 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/errorx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // TODO(lostluck): 2018/05/28 Extract these from their enum descriptors in the pipeline_v1 proto diff --git a/sdks/go/pkg/beam/artifact/materialize_test.go b/sdks/go/pkg/beam/artifact/materialize_test.go index 35223c908b773..31890ed045cc8 100644 --- a/sdks/go/pkg/beam/artifact/materialize_test.go +++ b/sdks/go/pkg/beam/artifact/materialize_test.go @@ -29,9 +29,9 @@ import ( jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" "google.golang.org/grpc" "google.golang.org/grpc/metadata" + "google.golang.org/protobuf/proto" ) // TestRetrieve tests that we can successfully retrieve fresh files. 
diff --git a/sdks/go/pkg/beam/coder.go b/sdks/go/pkg/beam/coder.go index 062bb337e8d81..b03b739ed7be4 100644 --- a/sdks/go/pkg/beam/coder.go +++ b/sdks/go/pkg/beam/coder.go @@ -30,8 +30,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/jsonx" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - protov1 "github.com/golang/protobuf/proto" protov2 "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/protoadapt" "google.golang.org/protobuf/reflect/protoreflect" ) @@ -51,7 +51,7 @@ type jsonCoder interface { json.Unmarshaler } -var protoMessageType = reflect.TypeOf((*protov1.Message)(nil)).Elem() +var protoMessageType = reflect.TypeOf((*protoadapt.MessageV1)(nil)).Elem() var protoReflectMessageType = reflect.TypeOf((*protoreflect.ProtoMessage)(nil)).Elem() var jsonCoderType = reflect.TypeOf((*jsonCoder)(nil)).Elem() @@ -276,8 +276,8 @@ func protoEnc(in T) ([]byte, error) { switch it := in.(type) { case protoreflect.ProtoMessage: p = it - case protov1.Message: - p = protov1.MessageV2(it) + case protoadapt.MessageV1: + p = protoadapt.MessageV2Of(it) } b, err := protov2.MarshalOptions{Deterministic: true}.Marshal(p) if err != nil { @@ -293,8 +293,8 @@ func protoDec(t reflect.Type, in []byte) (T, error) { switch it := reflect.New(t.Elem()).Interface().(type) { case protoreflect.ProtoMessage: p = it - case protov1.Message: - p = protov1.MessageV2(it) + case protoadapt.MessageV1: + p = protoadapt.MessageV2Of(it) } err := protov2.UnmarshalOptions{}.Unmarshal(in, p) if err != nil { diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate.go b/sdks/go/pkg/beam/core/runtime/exec/translate.go index 72af9e80c4052..b74ede228fd97 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/translate.go +++ b/sdks/go/pkg/beam/core/runtime/exec/translate.go @@ -33,7 +33,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" fnpb 
"github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // TODO(lostluck): 2018/05/28 Extract these from the canonical enums in beam_runner_api.proto diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder.go b/sdks/go/pkg/beam/core/runtime/graphx/coder.go index 87b3771e5756a..99ca5517d3d39 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder.go @@ -27,7 +27,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/protox" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) const ( @@ -615,8 +615,8 @@ func (b *CoderMarshaller) internRowCoder(schema *pipepb.Schema) string { } func (b *CoderMarshaller) internCoder(coder *pipepb.Coder) string { - key := proto.MarshalTextString(coder) - if id, exists := b.coder2id[key]; exists { + key := coder.String() + if id, exists := b.coder2id[(key)]; exists { return id } @@ -626,7 +626,7 @@ func (b *CoderMarshaller) internCoder(coder *pipepb.Coder) string { } else { id = fmt.Sprintf("c%v@%v", len(b.coder2id), b.Namespace) } - b.coder2id[key] = id + b.coder2id[string(key)] = id b.coders[id] = coder return id } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go index fdd9355e1cb81..0d44e68285b55 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go @@ -37,8 +37,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" 
"github.com/google/uuid" + "google.golang.org/protobuf/proto" ) // Initialize registered schemas. For use by the beam package at beam.Init time. diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go index 37b3e79f8f504..367d70e81d174 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go @@ -24,7 +24,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/testing/protocmp" @@ -806,7 +805,7 @@ func TestSchemaConversion(t *testing.T) { } if d := cmp.Diff(test.st, got, protocmp.Transform(), - protocmp.IgnoreFields(proto.MessageV2(&pipepb.Schema{}), "id"), + protocmp.IgnoreFields(&pipepb.Schema{}, "id"), ); d != "" { t.Errorf("diff (-want, +got): %v", d) } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate.go b/sdks/go/pkg/beam/core/runtime/graphx/translate.go index b05292546133a..65280ef6b9303 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate.go @@ -34,7 +34,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/options/resource" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/durationpb" ) @@ -1209,13 +1209,13 @@ func (m *marshaller) addWindowingStrategy(w *window.WindowingStrategy) (string, } func (m *marshaller) internWindowingStrategy(w *pipepb.WindowingStrategy) string { - key := proto.MarshalTextString(w) - if id, exists := m.windowing2id[key]; exists { + key := w.String() + if id, exists := 
m.windowing2id[(key)]; exists { return id } id := fmt.Sprintf("w%v", len(m.windowing2id)) - m.windowing2id[key] = id + m.windowing2id[string(key)] = id m.windowing[id] = w return id } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go index a331aedd585de..e18a5f97796b3 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate_test.go @@ -34,8 +34,8 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/protox" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" ) func init() { @@ -181,13 +181,13 @@ func TestMarshal(t *testing.T) { } if got, want := len(p.GetComponents().GetTransforms()), test.transforms; got != want { - t.Errorf("got %d transforms, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d transforms, want %d : %v", got, want, p.String()) } if got, want := len(p.GetRootTransformIds()), test.roots; got != want { - t.Errorf("got %d roots, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d roots, want %d : %v", got, want, p.String()) } if got, want := p.GetRequirements(), test.requirements; !cmp.Equal(got, want, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { - t.Errorf("incorrect requirements: got %v, want %v : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("incorrect requirements: got %v, want %v : %v", got, want, p.String()) } }) } @@ -248,7 +248,7 @@ func TestMarshal_PTransformAnnotations(t *testing.T) { pts := p.GetComponents().GetTransforms() if got, want := len(pts), test.transforms; got != want { - t.Errorf("got %d transforms, want %d : %v", got, want, proto.MarshalTextString(p)) + t.Errorf("got %d transforms, want %d : %v", got, want, p.String()) } for _, 
pt := range pts { // Context annotations only apply to composites, and are not duplicated to leaves. diff --git a/sdks/go/pkg/beam/core/runtime/harness/harness_test.go b/sdks/go/pkg/beam/core/runtime/harness/harness_test.go index 91dd3c591d5b3..8c25db613eba7 100644 --- a/sdks/go/pkg/beam/core/runtime/harness/harness_test.go +++ b/sdks/go/pkg/beam/core/runtime/harness/harness_test.go @@ -23,7 +23,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // validDescriptor describes a valid pipeline with a source and a sink, but doesn't do anything else. diff --git a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go index 76d4e1f32c23a..061cfca011f55 100644 --- a/sdks/go/pkg/beam/core/runtime/harness/statemgr.go +++ b/sdks/go/pkg/beam/core/runtime/harness/statemgr.go @@ -28,7 +28,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" "github.com/apache/beam/sdks/v2/go/pkg/beam/log" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - "github.com/golang/protobuf/proto" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) @@ -633,7 +632,7 @@ func (c *StateChannel) read(ctx context.Context) { if !ok { // This can happen if Send returns an error that write handles, but // the message was actually sent. 
- log.Errorf(ctx, "StateChannel[%v].read: no consumer for state response: %v", c.id, proto.MarshalTextString(msg)) + log.Errorf(ctx, "StateChannel[%v].read: no consumer for state response: %v", c.id, msg.String()) continue } @@ -641,7 +640,7 @@ func (c *StateChannel) read(ctx context.Context) { case ch <- msg: // ok default: - panic(fmt.Sprintf("StateChannel[%v].read: failed to consume state response: %v", c.id, proto.MarshalTextString(msg))) + panic(fmt.Sprintf("StateChannel[%v].read: failed to consume state response: %v", c.id, msg.String())) } } } diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go b/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go index 695830a483c07..b58a309837978 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/clone_test.go @@ -19,8 +19,8 @@ import ( "testing" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" ) func TestShallowClonePTransform(t *testing.T) { diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go b/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go index cfcce88675bed..9e527f2fd3220 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/replace.go @@ -28,7 +28,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // Update merges a pipeline with the given components, which may add, replace diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go b/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go index 79bfd43958aff..3024787e61631 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go +++ 
b/sdks/go/pkg/beam/core/runtime/pipelinex/replace_test.go @@ -20,8 +20,8 @@ import ( "testing" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/testing/protocmp" ) diff --git a/sdks/go/pkg/beam/core/runtime/pipelinex/util.go b/sdks/go/pkg/beam/core/runtime/pipelinex/util.go index 5fe9def9b2276..4735e7b77d206 100644 --- a/sdks/go/pkg/beam/core/runtime/pipelinex/util.go +++ b/sdks/go/pkg/beam/core/runtime/pipelinex/util.go @@ -19,7 +19,7 @@ import ( "sort" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // Bounded returns true iff all PCollections are bounded. diff --git a/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go b/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go index 1f18b333541bd..eec13c451a135 100644 --- a/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go +++ b/sdks/go/pkg/beam/core/runtime/xlangx/resolve_test.go @@ -20,7 +20,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) func createExternalEdge(typeUrn string, typePayload []byte) *graph.MultiEdge { diff --git a/sdks/go/pkg/beam/core/util/protox/any.go b/sdks/go/pkg/beam/core/util/protox/any.go index e539a8c19dec0..46bd08b1aff10 100644 --- a/sdks/go/pkg/beam/core/util/protox/any.go +++ b/sdks/go/pkg/beam/core/util/protox/any.go @@ -17,9 +17,9 @@ package protox import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - "github.com/golang/protobuf/proto" - protobuf "github.com/golang/protobuf/ptypes/any" - protobufw "github.com/golang/protobuf/ptypes/wrappers" + "google.golang.org/protobuf/proto" + protobuf "google.golang.org/protobuf/types/known/anypb" + protobufw 
"google.golang.org/protobuf/types/known/wrapperspb" ) const ( diff --git a/sdks/go/pkg/beam/core/util/protox/any_test.go b/sdks/go/pkg/beam/core/util/protox/any_test.go index 1975bec405cb8..9eb7621db3510 100644 --- a/sdks/go/pkg/beam/core/util/protox/any_test.go +++ b/sdks/go/pkg/beam/core/util/protox/any_test.go @@ -19,8 +19,8 @@ import ( "bytes" "testing" - "github.com/golang/protobuf/proto" - protobufw "github.com/golang/protobuf/ptypes/wrappers" + "google.golang.org/protobuf/proto" + protobufw "google.golang.org/protobuf/types/known/wrapperspb" ) func TestProtoPackingInvertibility(t *testing.T) { diff --git a/sdks/go/pkg/beam/core/util/protox/base64.go b/sdks/go/pkg/beam/core/util/protox/base64.go index 7f0f5a4bdeea1..79ea8a025f7c9 100644 --- a/sdks/go/pkg/beam/core/util/protox/base64.go +++ b/sdks/go/pkg/beam/core/util/protox/base64.go @@ -19,7 +19,7 @@ import ( "encoding/base64" "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) // MustEncodeBase64 encodes a proto wrapped in base64 and panics on failure. diff --git a/sdks/go/pkg/beam/core/util/protox/protox.go b/sdks/go/pkg/beam/core/util/protox/protox.go index 3555886eefc9a..892a2ba97d039 100644 --- a/sdks/go/pkg/beam/core/util/protox/protox.go +++ b/sdks/go/pkg/beam/core/util/protox/protox.go @@ -16,7 +16,7 @@ // Package protox contains utilities for working with protobufs. package protox -import "github.com/golang/protobuf/proto" +import "google.golang.org/protobuf/proto" // MustEncode encode the message and panics on failure. 
func MustEncode(msg proto.Message) []byte { diff --git a/sdks/go/pkg/beam/create_test.go b/sdks/go/pkg/beam/create_test.go index 785c3b33db621..e65fefc7f2d89 100644 --- a/sdks/go/pkg/beam/create_test.go +++ b/sdks/go/pkg/beam/create_test.go @@ -23,7 +23,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/passert" "github.com/apache/beam/sdks/v2/go/pkg/beam/testing/ptest" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/protoadapt" ) func TestMain(m *testing.M) { @@ -157,7 +157,5 @@ func (t *testProto) Unmarshal(b []byte) error { // Ensure testProto is detected as a proto.Message and can be (un)marshalled by // the proto library. var ( - _ proto.Message = &testProto{} - _ proto.Marshaler = &testProto{} - _ proto.Unmarshaler = &testProto{} + _ protoadapt.MessageV1 = &testProto{} ) diff --git a/sdks/go/pkg/beam/provision/provision.go b/sdks/go/pkg/beam/provision/provision.go index 3c36973535e79..58a8f5ee82928 100644 --- a/sdks/go/pkg/beam/provision/provision.go +++ b/sdks/go/pkg/beam/provision/provision.go @@ -24,7 +24,7 @@ import ( "github.com/apache/beam/sdks/v2/go/container/tools" fnpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/fnexecution_v1" - google_pb "github.com/golang/protobuf/ptypes/struct" + google_pb "google.golang.org/protobuf/types/known/structpb" ) // Info returns the runtime provisioning info for the worker. 
diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflow.go b/sdks/go/pkg/beam/runners/dataflow/dataflow.go index ca701979497ae..73667fb8ee6ee 100644 --- a/sdks/go/pkg/beam/runners/dataflow/dataflow.go +++ b/sdks/go/pkg/beam/runners/dataflow/dataflow.go @@ -47,7 +47,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/dataflow/dataflowlib" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/gcsx" "github.com/apache/beam/sdks/v2/go/pkg/beam/x/hooks/perf" - "github.com/golang/protobuf/proto" ) // TODO(herohde) 5/16/2017: the Dataflow flags should match the other SDKs. @@ -235,7 +234,7 @@ func Execute(ctx context.Context, p *beam.Pipeline) (beam.PipelineResult, error) if *dryRun { log.Info(ctx, "Dry-run: not submitting job!") - log.Info(ctx, proto.MarshalTextString(model)) + log.Info(ctx, model.String()) job, err := dataflowlib.Translate(ctx, model, opts, workerURL, modelURL) if err != nil { return nil, err diff --git a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go index 9a1641e314d12..806b8940ae994 100644 --- a/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go +++ b/sdks/go/pkg/beam/runners/dataflow/dataflowlib/execute.go @@ -30,7 +30,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/log" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/runnerlib" - "github.com/golang/protobuf/proto" df "google.golang.org/api/dataflow/v1b3" "google.golang.org/api/googleapi" ) @@ -82,7 +81,7 @@ func Execute(ctx context.Context, raw *pipepb.Pipeline, opts *JobOptions, worker } // (2) Upload model to GCS - log.Info(ctx, proto.MarshalTextString(raw)) + log.Info(ctx, raw.String()) if err := StageModel(ctx, opts.Project, modelURL, protox.MustEncode(raw)); err != nil { return presult, err diff --git a/sdks/go/pkg/beam/runners/universal/runnerlib/job.go b/sdks/go/pkg/beam/runners/universal/runnerlib/job.go index 
4e50661b3db8e..7d6a3027e47e6 100644 --- a/sdks/go/pkg/beam/runners/universal/runnerlib/job.go +++ b/sdks/go/pkg/beam/runners/universal/runnerlib/job.go @@ -28,7 +28,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/log" jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" ) // JobOptions capture the various options for submitting jobs @@ -152,7 +151,7 @@ func WaitForCompletion(ctx context.Context, client jobpb.JobServiceClient, jobID } default: - return errors.Errorf("unexpected job update: %v", proto.MarshalTextString(msg)) + return errors.Errorf("unexpected job update: %v", msg.String()) } } } diff --git a/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go b/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go index d5cc6aa7327a7..85d6fdc7e2ca4 100644 --- a/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go +++ b/sdks/go/pkg/beam/runners/universal/runnerlib/stage.go @@ -29,8 +29,8 @@ import ( jobpb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/jobmanagement_v1" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/proto" "google.golang.org/grpc" + "google.golang.org/protobuf/proto" ) // Stage stages the worker binary and any additional files to the given diff --git a/sdks/go/pkg/beam/runners/universal/universal.go b/sdks/go/pkg/beam/runners/universal/universal.go index 8af9e91e1e15e..c63175c58578f 100644 --- a/sdks/go/pkg/beam/runners/universal/universal.go +++ b/sdks/go/pkg/beam/runners/universal/universal.go @@ -32,7 +32,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/extworker" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/universal/runnerlib" "github.com/apache/beam/sdks/v2/go/pkg/beam/runners/vet" - "github.com/golang/protobuf/proto" ) func init() { @@ -93,7 +92,7 @@ func 
Execute(ctx context.Context, p *beam.Pipeline) (beam.PipelineResult, error) return nil, errors.WithContextf(err, "generating model pipeline") } - log.Info(ctx, proto.MarshalTextString(pipeline)) + log.Info(ctx, pipeline.String()) opt := &runnerlib.JobOptions{ Name: jobopts.GetJobName(), diff --git a/sdks/go/pkg/beam/transforms/xlang/schema/external.go b/sdks/go/pkg/beam/transforms/xlang/schema/external.go index 75be90cbe7b3a..55a858b9cf9ef 100644 --- a/sdks/go/pkg/beam/transforms/xlang/schema/external.go +++ b/sdks/go/pkg/beam/transforms/xlang/schema/external.go @@ -20,7 +20,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/xlangx" pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/proto" ) const schemaTransformURN = "beam:expansion:payload:schematransform:v1" diff --git a/sdks/java/container/boot.go b/sdks/java/container/boot.go index 14e2e4311b458..c23e50dcf1b06 100644 --- a/sdks/java/container/boot.go +++ b/sdks/java/container/boot.go @@ -35,7 +35,6 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/syscallx" - "github.com/golang/protobuf/proto" ) var ( @@ -126,12 +125,12 @@ func main() { if err := tools.MakePipelineOptionsFileAndEnvVar(options); err != nil { logger.Fatalf(ctx, "Failed to load pipeline options to worker: %v", err) } - os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint})) - os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint})) + os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}).String()) + os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}).String()) 
os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " ")) if info.GetStatusEndpoint() != nil { - os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint())) + os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", info.GetStatusEndpoint().String()) } const jarsDir = "/opt/apache/beam/jars" diff --git a/sdks/python/container/boot.go b/sdks/python/container/boot.go index 710041e0f0410..696604c64886d 100644 --- a/sdks/python/container/boot.go +++ b/sdks/python/container/boot.go @@ -41,8 +41,8 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/execx" "github.com/apache/beam/sdks/v2/go/pkg/beam/util/grpcx" - "github.com/golang/protobuf/jsonpb" - "github.com/golang/protobuf/proto" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" ) var ( @@ -217,12 +217,12 @@ func launchSDKProcess() error { os.Setenv("PIPELINE_OPTIONS", options) os.Setenv("SEMI_PERSISTENT_DIRECTORY", *semiPersistDir) - os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint})) - os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(&pipepb.ApiServiceDescriptor{Url: *controlEndpoint})) + os.Setenv("LOGGING_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *loggingEndpoint}).String()) + os.Setenv("CONTROL_API_SERVICE_DESCRIPTOR", (&pipepb.ApiServiceDescriptor{Url: *controlEndpoint}).String()) os.Setenv("RUNNER_CAPABILITIES", strings.Join(info.GetRunnerCapabilities(), " ")) if info.GetStatusEndpoint() != nil { - os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", proto.MarshalTextString(info.GetStatusEndpoint())) + os.Setenv("STATUS_API_SERVICE_DESCRIPTOR", info.GetStatusEndpoint().String()) } if metadata := info.GetMetadata(); metadata != nil { @@ -441,7 +441,7 @@ func processArtifactsInSetupOnlyMode() { files := make([]string, len(infoJsons)) for i, info := range 
infoJsons { var artifactInformation pipepb.ArtifactInformation - if err := jsonpb.UnmarshalString(info, &artifactInformation); err != nil { + if err := protojson.Unmarshal([]byte(info), &artifactInformation); err != nil { log.Fatalf("Unable to unmarshal artifact information from json string %v", info) } From b54967eab41f51e9329833d5e2ac18ee522c151c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:02:29 -0400 Subject: [PATCH 30/78] Fix Beam Schema to Iceberg Schema ID conversion logic (#32095) * fix iceberg schema ID logic * trigger integration tests --- .../IO_Iceberg_Integration_Tests.json | 2 +- .../beam/sdk/io/iceberg/IcebergUtils.java | 181 +++++++++--------- .../beam/sdk/io/iceberg/IcebergUtilsTest.java | 113 +++++------ 3 files changed, 148 insertions(+), 148 deletions(-) diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index 3f63c0c9975f2..bbdc3a3910ef8 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 3 } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java index a2f84e6475c9c..acd9b25a6a5e3 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergUtils.java @@ -34,6 +34,7 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; @@ -115,113 +116,110 @@ private static 
Schema icebergStructTypeToBeamSchema(final Types.StructType struc } /** - * Represents an Object (in practice, either {@link Type} or {@link Types.NestedField}) along with - * the most recent (max) ID that has been used to build this object. + * Represents a {@link Type} and the most recent field ID used to build it. * *

Iceberg Schema fields are required to have unique IDs. This includes unique IDs for a {@link - * Types.ListType}'s collection type, a {@link Types.MapType}'s key type and value type, and - * nested {@link Types.StructType}s. When constructing any of these types, we use multiple unique - * ID's for the type's components. The {@code maxId} in this object represents the most recent ID - * used after building this type. This helps signal that the next field we construct should have - * an ID greater than this one. + * org.apache.iceberg.types.Type.NestedType}'s components (e.g. {@link Types.ListType}'s + * collection type, {@link Types.MapType}'s key type and value type, and {@link + * Types.StructType}'s nested fields). The {@code maxId} in this object represents the most recent + * ID used after building this type. This helps signal that the next {@link + * org.apache.iceberg.types.Type.NestedType} we construct should have an ID greater than this one. */ @VisibleForTesting - static class ObjectAndMaxId { + static class TypeAndMaxId { int maxId; - T object; + Type type; - ObjectAndMaxId(int id, T object) { + TypeAndMaxId(int id, Type object) { this.maxId = id; - this.object = object; + this.type = object; } } /** - * Given a Beam {@link Schema.FieldType} and an index, returns an Iceberg {@link Type} and the - * maximum index after building the Iceberg Type. This assumes the input index is already in use - * (usually by the parent {@link Types.NestedField}, and will start building the Iceberg type from - * index + 1. + * Takes a Beam {@link Schema.FieldType} and an index intended as a starting point for Iceberg + * {@link org.apache.iceberg.types.Type.NestedType}s. Returns an Iceberg {@link Type} and the + * maximum index after building that type. * - *

Returns this information in an {@link ObjectAndMaxId} instance. + *

Returns this information in an {@link TypeAndMaxId} object. */ @VisibleForTesting - static ObjectAndMaxId beamFieldTypeToIcebergFieldType( - int fieldId, Schema.FieldType beamType) { + static TypeAndMaxId beamFieldTypeToIcebergFieldType( + Schema.FieldType beamType, int nestedFieldId) { if (BEAM_TYPES_TO_ICEBERG_TYPES.containsKey(beamType.getTypeName())) { - return new ObjectAndMaxId<>(fieldId, BEAM_TYPES_TO_ICEBERG_TYPES.get(beamType.getTypeName())); + // we don't use nested field ID for primitive types. decrement it so the caller can use it for + // other types. + return new TypeAndMaxId( + --nestedFieldId, BEAM_TYPES_TO_ICEBERG_TYPES.get(beamType.getTypeName())); } else if (beamType.getTypeName().isCollectionType()) { // ARRAY or ITERABLE - // List ID needs to be unique from the NestedField that contains this ListType - int listId = fieldId + 1; Schema.FieldType beamCollectionType = Preconditions.checkArgumentNotNull(beamType.getCollectionElementType()); - ObjectAndMaxId listInfo = beamFieldTypeToIcebergFieldType(listId, beamCollectionType); - Type icebergCollectionType = listInfo.object; + // nestedFieldId is reserved for the list's collection type. + // we increment here because further nested fields should use unique ID's + TypeAndMaxId listInfo = + beamFieldTypeToIcebergFieldType(beamCollectionType, nestedFieldId + 1); + Type icebergCollectionType = listInfo.type; boolean elementTypeIsNullable = Preconditions.checkArgumentNotNull(beamType.getCollectionElementType()).getNullable(); Type listType = elementTypeIsNullable - ? Types.ListType.ofOptional(listId, icebergCollectionType) - : Types.ListType.ofRequired(listId, icebergCollectionType); + ? 
Types.ListType.ofOptional(nestedFieldId, icebergCollectionType) + : Types.ListType.ofRequired(nestedFieldId, icebergCollectionType); - return new ObjectAndMaxId<>(listInfo.maxId, listType); + return new TypeAndMaxId(listInfo.maxId, listType); } else if (beamType.getTypeName().isMapType()) { // MAP - // key and value IDs need to be unique from the NestedField that contains this MapType - int keyId = fieldId + 1; - int valueId = fieldId + 2; - int maxId = valueId; + // key and value IDs need to be unique + int keyId = nestedFieldId; + int valueId = keyId + 1; + // nested field IDs should be unique + nestedFieldId = valueId + 1; Schema.FieldType beamKeyType = Preconditions.checkArgumentNotNull(beamType.getMapKeyType()); - ObjectAndMaxId keyInfo = beamFieldTypeToIcebergFieldType(maxId, beamKeyType); - Type icebergKeyType = keyInfo.object; - maxId = keyInfo.maxId; + TypeAndMaxId keyInfo = beamFieldTypeToIcebergFieldType(beamKeyType, nestedFieldId); + Type icebergKeyType = keyInfo.type; + nestedFieldId = keyInfo.maxId + 1; Schema.FieldType beamValueType = Preconditions.checkArgumentNotNull(beamType.getMapValueType()); - ObjectAndMaxId valueInfo = beamFieldTypeToIcebergFieldType(maxId, beamValueType); - Type icebergValueType = valueInfo.object; - maxId = valueInfo.maxId; + TypeAndMaxId valueInfo = beamFieldTypeToIcebergFieldType(beamValueType, nestedFieldId); + Type icebergValueType = valueInfo.type; Type mapType = beamValueType.getNullable() ? 
Types.MapType.ofOptional(keyId, valueId, icebergKeyType, icebergValueType) : Types.MapType.ofRequired(keyId, valueId, icebergKeyType, icebergValueType); - return new ObjectAndMaxId<>(maxId, mapType); + return new TypeAndMaxId(valueInfo.maxId, mapType); } else if (beamType.getTypeName().isCompositeType()) { // ROW // Nested field IDs need to be unique from the field that contains this StructType - int maxFieldId = fieldId; - Schema nestedSchema = Preconditions.checkArgumentNotNull(beamType.getRowSchema()); List nestedFields = new ArrayList<>(nestedSchema.getFieldCount()); - for (Schema.Field field : nestedSchema.getFields()) { - ObjectAndMaxId converted = beamFieldToIcebergField(++maxFieldId, field); - Types.NestedField nestedField = converted.object; - nestedFields.add(nestedField); - maxFieldId = converted.maxId; + int icebergFieldId = nestedFieldId; + nestedFieldId = icebergFieldId + nestedSchema.getFieldCount(); + for (Schema.Field beamField : nestedSchema.getFields()) { + TypeAndMaxId typeAndMaxId = + beamFieldTypeToIcebergFieldType(beamField.getType(), nestedFieldId); + Types.NestedField icebergField = + Types.NestedField.of( + icebergFieldId++, + beamField.getType().getNullable(), + beamField.getName(), + typeAndMaxId.type); + + nestedFields.add(icebergField); + nestedFieldId = typeAndMaxId.maxId + 1; } Type structType = Types.StructType.of(nestedFields); - return new ObjectAndMaxId<>(maxFieldId, structType); + return new TypeAndMaxId(nestedFieldId - 1, structType); } - return new ObjectAndMaxId<>(fieldId, Types.StringType.get()); - } - - private static ObjectAndMaxId beamFieldToIcebergField( - int fieldId, final Schema.Field field) { - ObjectAndMaxId typeAndMaxId = beamFieldTypeToIcebergFieldType(fieldId, field.getType()); - Type icebergType = typeAndMaxId.object; - int id = typeAndMaxId.maxId; - - Types.NestedField icebergField = - Types.NestedField.of(fieldId, field.getType().getNullable(), field.getName(), icebergType); - - return new ObjectAndMaxId<>(id, 
icebergField); + return new TypeAndMaxId(nestedFieldId, Types.StringType.get()); } /** @@ -233,18 +231,23 @@ private static ObjectAndMaxId beamFieldToIcebergField( *

  • {@link Schema.TypeName.LOGICAL_TYPE} */ public static org.apache.iceberg.Schema beamSchemaToIcebergSchema(final Schema schema) { - Types.NestedField[] fields = new Types.NestedField[schema.getFieldCount()]; - int nextIcebergFieldId = 1; - for (int i = 0; i < schema.getFieldCount(); i++) { - Schema.Field beamField = schema.getField(i); - ObjectAndMaxId fieldAndMaxId = - beamFieldToIcebergField(nextIcebergFieldId, beamField); - Types.NestedField field = fieldAndMaxId.object; - fields[i] = field; - - nextIcebergFieldId = fieldAndMaxId.maxId + 1; + List fields = new ArrayList<>(schema.getFieldCount()); + int nestedFieldId = schema.getFieldCount() + 1; + int icebergFieldId = 1; + for (Schema.Field beamField : schema.getFields()) { + TypeAndMaxId typeAndMaxId = + beamFieldTypeToIcebergFieldType(beamField.getType(), nestedFieldId); + Types.NestedField icebergField = + Types.NestedField.of( + icebergFieldId++, + beamField.getType().getNullable(), + beamField.getName(), + typeAndMaxId.type); + + fields.add(icebergField); + nestedFieldId = typeAndMaxId.maxId + 1; } - return new org.apache.iceberg.Schema(fields); + return new org.apache.iceberg.Schema(fields.toArray(new Types.NestedField[fields.size()])); } /** Converts a Beam {@link Row} to an Iceberg {@link Record}. 
*/ @@ -323,27 +326,21 @@ private static void copyFieldIntoRecord(Record rec, Types.NestedField field, Row public static Row icebergRecordToBeamRow(Schema schema, Record record) { Row.Builder rowBuilder = Row.withSchema(schema); for (Schema.Field field : schema.getFields()) { + boolean isNullable = field.getType().getNullable(); + @Nullable Object icebergValue = record.getField(field.getName()); + if (icebergValue == null) { + if (isNullable) { + rowBuilder.addValue(null); + continue; + } + throw new RuntimeException( + String.format("Received null value for required field '%s'.", field.getName())); + } switch (field.getType().getTypeName()) { case BYTE: - // I guess allow anything we can cast here - byte byteValue = (byte) record.getField(field.getName()); - rowBuilder.addValue(byteValue); - break; case INT16: - // I guess allow anything we can cast here - short shortValue = (short) record.getField(field.getName()); - rowBuilder.addValue(shortValue); - break; case INT32: - // I guess allow anything we can cast here - int intValue = (int) record.getField(field.getName()); - rowBuilder.addValue(intValue); - break; case INT64: - // I guess allow anything we can cast here - long longValue = (long) record.getField(field.getName()); - rowBuilder.addValue(longValue); - break; case DECIMAL: // Iceberg and Beam both use BigDecimal case FLOAT: // Iceberg and Beam both use float case DOUBLE: // Iceberg and Beam both use double @@ -352,29 +349,31 @@ public static Row icebergRecordToBeamRow(Schema schema, Record record) { case ARRAY: case ITERABLE: case MAP: - rowBuilder.addValue(record.getField(field.getName())); + rowBuilder.addValue(icebergValue); break; case DATETIME: // Iceberg uses a long for millis; Beam uses joda time DateTime - long millis = (long) record.getField(field.getName()); + long millis = (long) icebergValue; rowBuilder.addValue(new DateTime(millis, DateTimeZone.UTC)); break; case BYTES: // Iceberg uses ByteBuffer; Beam uses byte[] - 
rowBuilder.addValue(((ByteBuffer) record.getField(field.getName())).array()); + rowBuilder.addValue(((ByteBuffer) icebergValue).array()); break; case ROW: - Record nestedRecord = (Record) record.getField(field.getName()); + Record nestedRecord = (Record) icebergValue; Schema nestedSchema = checkArgumentNotNull( field.getType().getRowSchema(), "Corrupted schema: Row type did not have associated nested schema."); - Row nestedRow = icebergRecordToBeamRow(nestedSchema, nestedRecord); - rowBuilder.addValue(nestedRow); + rowBuilder.addValue(icebergRecordToBeamRow(nestedSchema, nestedRecord)); break; case LOGICAL_TYPE: throw new UnsupportedOperationException( "Cannot convert iceberg field to Beam logical type"); + default: + throw new UnsupportedOperationException( + "Unsupported Beam type: " + field.getType().getTypeName()); } } return rowBuilder.build(); diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java index c4da0b22f4d95..a20d5b7c8f59a 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergUtilsTest.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.iceberg; -import static org.apache.beam.sdk.io.iceberg.IcebergUtils.ObjectAndMaxId; +import static org.apache.beam.sdk.io.iceberg.IcebergUtils.TypeAndMaxId; import static org.apache.beam.sdk.io.iceberg.IcebergUtils.beamFieldTypeToIcebergFieldType; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -316,11 +316,11 @@ private static class BeamFieldTypeTestCase { private void checkTypes(List testCases) { for (BeamFieldTypeTestCase testCase : testCases) { - ObjectAndMaxId ret = - beamFieldTypeToIcebergFieldType(testCase.icebergFieldId, testCase.beamType); + TypeAndMaxId ret = 
+ beamFieldTypeToIcebergFieldType(testCase.beamType, testCase.icebergFieldId); assertEquals(testCase.expectedMaxId, ret.maxId); - checkEquals(testCase.expectedIcebergType, ret.object); + checkEquals(testCase.expectedIcebergType, ret.type); } } @@ -338,65 +338,65 @@ private void checkEquals(Type expected, Type actual) { @Test public void testPrimitiveBeamFieldTypeToIcebergFieldType() { + // primitive types don't use the nested field ID List primitives = Arrays.asList( - new BeamFieldTypeTestCase(1, Schema.FieldType.BOOLEAN, 1, Types.BooleanType.get()), - new BeamFieldTypeTestCase(3, Schema.FieldType.INT32, 3, Types.IntegerType.get()), - new BeamFieldTypeTestCase(6, Schema.FieldType.INT64, 6, Types.LongType.get()), - new BeamFieldTypeTestCase(10, Schema.FieldType.FLOAT, 10, Types.FloatType.get()), - new BeamFieldTypeTestCase(7, Schema.FieldType.DOUBLE, 7, Types.DoubleType.get()), - new BeamFieldTypeTestCase(11, Schema.FieldType.STRING, 11, Types.StringType.get()), - new BeamFieldTypeTestCase(15, Schema.FieldType.BYTES, 15, Types.BinaryType.get())); + new BeamFieldTypeTestCase(1, Schema.FieldType.BOOLEAN, 0, Types.BooleanType.get()), + new BeamFieldTypeTestCase(3, Schema.FieldType.INT32, 2, Types.IntegerType.get()), + new BeamFieldTypeTestCase(6, Schema.FieldType.INT64, 5, Types.LongType.get()), + new BeamFieldTypeTestCase(10, Schema.FieldType.FLOAT, 9, Types.FloatType.get()), + new BeamFieldTypeTestCase(7, Schema.FieldType.DOUBLE, 6, Types.DoubleType.get()), + new BeamFieldTypeTestCase(11, Schema.FieldType.STRING, 10, Types.StringType.get()), + new BeamFieldTypeTestCase(15, Schema.FieldType.BYTES, 14, Types.BinaryType.get())); checkTypes(primitives); } @Test public void testArrayBeamFieldTypeToIcebergFieldType() { - // Iceberg sets one field ID for the List type itself and another field ID for the collection - // type. 
+ // Iceberg's ListType reserves one nested ID for its element type List listTypes = Arrays.asList( new BeamFieldTypeTestCase( 1, Schema.FieldType.array(Schema.FieldType.BOOLEAN), - 2, + 1, Types.ListType.ofRequired(1, Types.BooleanType.get())), new BeamFieldTypeTestCase( 3, Schema.FieldType.iterable(Schema.FieldType.INT32), - 4, + 3, Types.ListType.ofRequired(3, Types.IntegerType.get())), new BeamFieldTypeTestCase( 6, Schema.FieldType.array(Schema.FieldType.INT64), - 7, + 6, Types.ListType.ofRequired(6, Types.LongType.get())), new BeamFieldTypeTestCase( 10, Schema.FieldType.array(Schema.FieldType.FLOAT), - 11, + 10, Types.ListType.ofRequired(10, Types.FloatType.get())), new BeamFieldTypeTestCase( 7, Schema.FieldType.iterable(Schema.FieldType.DOUBLE), - 8, + 7, Types.ListType.ofRequired(7, Types.DoubleType.get())), new BeamFieldTypeTestCase( 11, Schema.FieldType.array(Schema.FieldType.STRING), - 12, + 11, Types.ListType.ofRequired(11, Types.StringType.get())), new BeamFieldTypeTestCase( 15, Schema.FieldType.iterable(Schema.FieldType.BYTES), - 16, + 15, Types.ListType.ofRequired(15, Types.BinaryType.get())), new BeamFieldTypeTestCase( 23, Schema.FieldType.array( Schema.FieldType.array(Schema.FieldType.iterable(Schema.FieldType.STRING))), - 26, + 25, Types.ListType.ofRequired( 23, Types.ListType.ofRequired( @@ -407,23 +407,23 @@ public void testArrayBeamFieldTypeToIcebergFieldType() { @Test public void testStructBeamFieldTypeToIcebergFieldType() { - // Iceberg sets one field ID for each nested type. + // Iceberg sets one unique field ID for each nested type. 
List listTypes = Arrays.asList( new BeamFieldTypeTestCase( 1, Schema.FieldType.row(Schema.builder().addStringField("str").build()), - 2, + 1, Types.StructType.of( - Types.NestedField.required(2, "str", Types.StringType.get()))), + Types.NestedField.required(1, "str", Types.StringType.get()))), new BeamFieldTypeTestCase( 3, Schema.FieldType.row(Schema.builder().addInt32Field("int").build()), - 4, + 3, Types.StructType.of( - Types.NestedField.required(4, "int", Types.IntegerType.get()))), + Types.NestedField.required(3, "int", Types.IntegerType.get()))), new BeamFieldTypeTestCase( - 0, + 1, Schema.FieldType.row(BEAM_SCHEMA_PRIMITIVE), 7, Types.StructType.of(ICEBERG_SCHEMA_PRIMITIVE.columns())), @@ -434,11 +434,11 @@ public void testStructBeamFieldTypeToIcebergFieldType() { .addArrayField("arr", Schema.FieldType.STRING) .addNullableStringField("str") .build()), - 18, + 17, Types.StructType.of( Types.NestedField.required( - 16, "arr", Types.ListType.ofRequired(17, Types.StringType.get())), - Types.NestedField.optional(18, "str", Types.StringType.get()))), + 15, "arr", Types.ListType.ofRequired(17, Types.StringType.get())), + Types.NestedField.optional(16, "str", Types.StringType.get()))), new BeamFieldTypeTestCase( 20, Schema.FieldType.row( @@ -452,10 +452,10 @@ public void testStructBeamFieldTypeToIcebergFieldType() { .addNullableRowField( "nullable_row", Schema.builder().addInt64Field("long").build()) .build()), - 25, + 24, Types.StructType.of( Types.NestedField.required( - 21, + 20, "row", Types.StructType.of( Types.NestedField.required( @@ -465,33 +465,34 @@ public void testStructBeamFieldTypeToIcebergFieldType() { Types.NestedField.required( 23, "str", Types.StringType.get()))))), Types.NestedField.optional( - 24, + 21, "nullable_row", Types.StructType.of( - Types.NestedField.required(25, "long", Types.LongType.get())))))); + Types.NestedField.required(24, "long", Types.LongType.get())))))); checkTypes(listTypes); } @Test public void 
testMapBeamFieldTypeToIcebergFieldType() { + // Iceberg's MapType reserves two nested IDs. one for its key type and one for its value type. List primitives = Arrays.asList( new BeamFieldTypeTestCase( 1, Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32), - 3, - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.IntegerType.get())), + 2, + Types.MapType.ofRequired(1, 2, Types.StringType.get(), Types.IntegerType.get())), new BeamFieldTypeTestCase( 6, Schema.FieldType.map( Schema.FieldType.FLOAT, Schema.FieldType.array(Schema.FieldType.STRING)), - 9, + 8, Types.MapType.ofRequired( + 6, 7, - 8, Types.FloatType.get(), - Types.ListType.ofRequired(9, Types.StringType.get()))), + Types.ListType.ofRequired(8, Types.StringType.get()))), new BeamFieldTypeTestCase( 10, Schema.FieldType.map( @@ -499,30 +500,30 @@ public void testMapBeamFieldTypeToIcebergFieldType() { Schema.FieldType.map( Schema.FieldType.BOOLEAN, Schema.FieldType.map(Schema.FieldType.STRING, Schema.FieldType.INT32))), - 16, + 15, Types.MapType.ofRequired( + 10, 11, - 12, Types.StringType.get(), Types.MapType.ofRequired( + 12, 13, - 14, Types.BooleanType.get(), Types.MapType.ofRequired( - 15, 16, Types.StringType.get(), Types.IntegerType.get())))), + 14, 15, Types.StringType.get(), Types.IntegerType.get())))), new BeamFieldTypeTestCase( 15, Schema.FieldType.map( Schema.FieldType.row(Schema.builder().addStringField("str").build()), Schema.FieldType.row(Schema.builder().addInt32Field("int").build())), - 19, + 18, Types.MapType.ofRequired( + 15, 16, - 17, Types.StructType.of( - Types.NestedField.required(18, "str", Types.StringType.get())), + Types.NestedField.required(17, "str", Types.StringType.get())), Types.StructType.of( - Types.NestedField.required(19, "int", Types.IntegerType.get()))))); + Types.NestedField.required(18, "int", Types.IntegerType.get()))))); checkTypes(primitives); } @@ -574,9 +575,9 @@ public void testPrimitiveIcebergSchemaToBeamSchema() { .build(); static final 
org.apache.iceberg.Schema ICEBERG_SCHEMA_LIST = new org.apache.iceberg.Schema( - required(1, "arr_str", Types.ListType.ofRequired(2, Types.StringType.get())), - required(3, "arr_int", Types.ListType.ofRequired(4, Types.IntegerType.get())), - required(5, "arr_bool", Types.ListType.ofRequired(6, Types.BooleanType.get()))); + required(1, "arr_str", Types.ListType.ofRequired(4, Types.StringType.get())), + required(2, "arr_int", Types.ListType.ofRequired(5, Types.IntegerType.get())), + required(3, "arr_bool", Types.ListType.ofRequired(6, Types.BooleanType.get()))); @Test public void testArrayBeamSchemaToIcebergSchema() { @@ -607,9 +608,9 @@ public void testArrayIcebergSchemaToBeamSchema() { required( 1, "str_int", - Types.MapType.ofRequired(2, 3, Types.StringType.get(), Types.IntegerType.get())), + Types.MapType.ofRequired(3, 4, Types.StringType.get(), Types.IntegerType.get())), optional( - 4, + 2, "long_bool", Types.MapType.ofRequired(5, 6, Types.LongType.get(), Types.BooleanType.get()))); @@ -648,11 +649,11 @@ public void testMapIcebergSchemaToBeamSchema() { 1, "row", Types.StructType.of( - required(2, "str", Types.StringType.get()), - optional(3, "int", Types.IntegerType.get()), - required(4, "long", Types.LongType.get()))), + required(3, "str", Types.StringType.get()), + optional(4, "int", Types.IntegerType.get()), + required(5, "long", Types.LongType.get()))), optional( - 5, + 2, "nullable_row", Types.StructType.of( optional(6, "str", Types.StringType.get()), From 07e692b56fb19550c40eede5d39b951851eb8980 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:06:22 -0700 Subject: [PATCH 31/78] Bump github.com/nats-io/nats-server/v2 from 2.10.12 to 2.10.17 in /sdks (#31709) Bumps [github.com/nats-io/nats-server/v2](https://github.com/nats-io/nats-server) from 2.10.12 to 2.10.17. 
- [Release notes](https://github.com/nats-io/nats-server/releases) - [Changelog](https://github.com/nats-io/nats-server/blob/main/.goreleaser.yml) - [Commits](https://github.com/nats-io/nats-server/compare/v2.10.12...v2.10.17) --- updated-dependencies: - dependency-name: github.com/nats-io/nats-server/v2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 14 +++++++------- sdks/go.sum | 32 ++++++++++++++++---------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index fb0b7f85f3dea..654a456285431 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -44,8 +44,8 @@ require ( github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6 github.com/lib/pq v1.10.9 github.com/linkedin/goavro/v2 v2.13.0 - github.com/nats-io/nats-server/v2 v2.10.16 - github.com/nats-io/nats.go v1.35.0 + github.com/nats-io/nats-server/v2 v2.10.18 + github.com/nats-io/nats.go v1.36.0 github.com/proullon/ramsql v0.1.3 github.com/spf13/cobra v1.8.1 github.com/testcontainers/testcontainers-go v0.32.0 @@ -56,7 +56,7 @@ require ( golang.org/x/net v0.26.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.8.0 - golang.org/x/sys v0.21.0 + golang.org/x/sys v0.22.0 golang.org/x/text v0.17.0 google.golang.org/api v0.187.0 google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d @@ -89,10 +89,10 @@ require ( github.com/go-ole/go-ole v1.2.6 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect - github.com/minio/highwayhash v1.0.2 // indirect + github.com/minio/highwayhash v1.0.3 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/sys/user v0.1.0 // indirect - github.com/nats-io/jwt/v2 v2.5.7 // indirect + github.com/nats-io/jwt/v2 v2.5.8 // indirect github.com/nats-io/nkeys v0.4.7 // 
indirect github.com/nats-io/nuid v1.0.1 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect @@ -158,7 +158,7 @@ require ( github.com/gorilla/handlers v1.5.2 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/klauspost/compress v1.17.8 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/klauspost/cpuid/v2 v2.2.6 // indirect github.com/magiconair/properties v1.8.7 // indirect github.com/moby/patternmatcher v0.6.0 // indirect @@ -181,7 +181,7 @@ require ( github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.24.0 // indirect + golang.org/x/crypto v0.25.0 // indirect golang.org/x/mod v0.17.0 // indirect golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 6afc175732ff3..1c03d5afc89cd 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1019,8 +1019,8 @@ github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j github.com/klauspost/compress v1.9.7/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.13.1/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= -github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU= -github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.6 
h1:ndNyv040zDGIDh8thGkXYjnFtiN02M1PVVF+JE/48xc= github.com/klauspost/cpuid/v2 v2.2.6/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= @@ -1051,8 +1051,8 @@ github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/ github.com/mattn/go-sqlite3 v1.14.14/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= -github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g= -github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= +github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q= +github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.66 h1:bnTOXOHjOqv/gcMuiVbN9o2ngRItvqE774dG9nq0Dzw= @@ -1077,12 +1077,12 @@ github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8 github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= -github.com/nats-io/jwt/v2 v2.5.7 h1:j5lH1fUXCnJnY8SsQeB/a/z9Azgu2bYIDvtPVNdxe2c= -github.com/nats-io/jwt/v2 v2.5.7/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= -github.com/nats-io/nats-server/v2 v2.10.16 h1:2jXaiydp5oB/nAx/Ytf9fdCi9QN6ItIc9eehX8kwVV0= -github.com/nats-io/nats-server/v2 v2.10.16/go.mod h1:Pksi38H2+6xLe1vQx0/EA4bzetM0NqyIHcIbmgXSkIU= -github.com/nats-io/nats.go v1.35.0 h1:XFNqNM7v5B+MQMKqVGAyHwYhyKb48jrenXNxIU20ULk= 
-github.com/nats-io/nats.go v1.35.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= +github.com/nats-io/jwt/v2 v2.5.8 h1:uvdSzwWiEGWGXf+0Q+70qv6AQdvcvxrv9hPM0RiPamE= +github.com/nats-io/jwt/v2 v2.5.8/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= +github.com/nats-io/nats-server/v2 v2.10.18 h1:tRdZmBuWKVAFYtayqlBB2BuCHNGAQPvoQIXOKwU3WSM= +github.com/nats-io/nats-server/v2 v2.10.18/go.mod h1:97Qyg7YydD8blKlR8yBsUlPlWyZKjA7Bp5cl3MUE9K8= +github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU= +github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw= @@ -1251,8 +1251,8 @@ golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1425,7 +1425,6 @@ golang.org/x/sync 
v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190130150945-aca44879d564/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1511,8 +1510,9 @@ golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1521,8 +1521,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod 
h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From 656a296a82d00f1d17745bbe6a161e75104506e7 Mon Sep 17 00:00:00 2001 From: Damon Date: Wed, 7 Aug 2024 13:16:00 -0700 Subject: [PATCH 32/78] Enable Job management for the Prism runner (#32091) --- .../beam/runners/prism/PrismJobManager.java | 160 +++++++++++++ .../runners/prism/PrismJobManagerTest.java | 211 ++++++++++++++++++ 2 files changed, 371 insertions(+) create mode 100644 runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java create mode 100644 runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java diff --git a/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java new file mode 100644 index 0000000000000..e461e92c47496 --- /dev/null +++ b/runners/prism/java/src/main/java/org/apache/beam/runners/prism/PrismJobManager.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import java.io.Closeable; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.fn.channel.ManagedChannelFactory; +import org.apache.beam.sdk.options.PortablePipelineOptions; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.joda.time.Duration; + +/** + * A wrapper for {@link JobServiceGrpc.JobServiceBlockingStub} that {@link #close}es when {@link + * StateListener#onStateChanged} is invoked with a {@link PipelineResult.State} that is {@link + * PipelineResult.State#isTerminal}. + */ +@AutoValue +abstract class PrismJobManager implements StateListener, Closeable { + + /** + * Instantiate a {@link PrismJobManager} with {@param options}, assigning {@link #getEndpoint} + * from {@link PortablePipelineOptions#getJobEndpoint} and {@link #getTimeout} from {@link + * PortablePipelineOptions#getJobServerTimeout}. Defaults the instantiations of {@link + * #getManagedChannel} and {@link #getBlockingStub}. See respective getters for more details. 
+ */ + static PrismJobManager of(PortablePipelineOptions options) { + return builder() + .setEndpoint(options.getJobEndpoint()) + .setTimeout(Duration.standardSeconds(options.getJobServerTimeout())) + .build(); + } + + static Builder builder() { + return new AutoValue_PrismJobManager.Builder(); + } + + /** + * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#prepare} + * method. + */ + JobApi.PrepareJobResponse prepare(JobApi.PrepareJobRequest request) { + return getBlockingStub().prepare(request); + } + + /** + * Executes {@link #getBlockingStub()}'s {@link JobServiceGrpc.JobServiceBlockingStub#run} method. + */ + JobApi.RunJobResponse run(JobApi.RunJobRequest request) { + return getBlockingStub().run(request); + } + + /** The {@link JobServiceGrpc} endpoint. */ + abstract String getEndpoint(); + + /** The {@link JobServiceGrpc} timeout. */ + abstract Duration getTimeout(); + + /** The {@link #getBlockingStub}'s channel. Defaulted from the {@link #getEndpoint()}. */ + abstract ManagedChannel getManagedChannel(); + + /** The wrapped service defaulted using the {@link #getManagedChannel}. */ + abstract JobServiceGrpc.JobServiceBlockingStub getBlockingStub(); + + /** Shuts down {@link #getManagedChannel}, if not {@link #isShutdown}. */ + @Override + public void close() { + if (isShutdown()) { + return; + } + getManagedChannel().shutdown(); + try { + getManagedChannel().awaitTermination(3000L, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + } + } + + /** Queries whether {@link #getManagedChannel} {@link ManagedChannel#isShutdown}. */ + boolean isShutdown() { + return getManagedChannel().isShutdown(); + } + + /** + * Override of {@link StateListener#onStateChanged}. Invokes {@link #close} when {@link + * PipelineResult.State} {@link PipelineResult.State#isTerminal}. 
+ */ + @Override + public void onStateChanged(PipelineResult.State state) { + if (state.isTerminal()) { + close(); + } + } + + @AutoValue.Builder + abstract static class Builder { + + abstract Builder setEndpoint(String endpoint); + + abstract Optional getEndpoint(); + + abstract Builder setTimeout(Duration timeout); + + abstract Optional getTimeout(); + + abstract Builder setManagedChannel(ManagedChannel managedChannel); + + abstract Optional getManagedChannel(); + + abstract Builder setBlockingStub(JobServiceGrpc.JobServiceBlockingStub blockingStub); + + abstract Optional getBlockingStub(); + + abstract PrismJobManager autoBuild(); + + final PrismJobManager build() { + + checkState(getEndpoint().isPresent(), "endpoint is not set"); + checkState(getTimeout().isPresent(), "timeout is not set"); + + if (!getManagedChannel().isPresent()) { + ManagedChannelFactory channelFactory = ManagedChannelFactory.createDefault(); + + setManagedChannel( + channelFactory.forDescriptor( + Endpoints.ApiServiceDescriptor.newBuilder().setUrl(getEndpoint().get()).build())); + } + + if (!getBlockingStub().isPresent()) { + setBlockingStub( + JobServiceGrpc.newBlockingStub(getManagedChannel().get()) + .withDeadlineAfter(getTimeout().get().getMillis(), TimeUnit.MILLISECONDS) + .withWaitForReady()); + } + + return autoBuild(); + } + } +} diff --git a/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java new file mode 100644 index 0000000000000..1e38e4f8d12ed --- /dev/null +++ b/runners/prism/java/src/test/java/org/apache/beam/runners/prism/PrismJobManagerTest.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.runners.prism; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import java.io.IOException; +import java.util.Optional; +import org.apache.beam.model.jobmanagement.v1.JobApi; +import org.apache.beam.model.jobmanagement.v1.JobServiceGrpc; +import org.apache.beam.model.pipeline.v1.Endpoints; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.transforms.Impulse; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.ManagedChannel; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessChannelBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.inprocess.InProcessServerBuilder; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.stub.StreamObserver; +import org.apache.beam.vendor.grpc.v1p60p1.io.grpc.testing.GrpcCleanupRule; +import org.joda.time.Duration; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link PrismJobManager}. 
*/ +@RunWith(JUnit4.class) +public class PrismJobManagerTest { + @Rule public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + @Rule public TestName testName = new TestName(); + + @Test + public void givenPrepareError_forwardsException_canGracefulShutdown() { + TestJobService service = + new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + assertThrows( + RuntimeException.class, + () -> + underTest.prepare( + JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build())); + assertThat(underTest.isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenPrepareSuccess_forwardsResponse_canGracefulShutdown() { + TestJobService service = + new TestJobService() + .withPrepareJobResponse( + JobApi.PrepareJobResponse.newBuilder() + .setStagingSessionToken("token") + .setPreparationId("preparationId") + .setArtifactStagingEndpoint( + Endpoints.ApiServiceDescriptor.newBuilder() + .setUrl("localhost:1234") + .build()) + .build()); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + JobApi.PrepareJobResponse response = + underTest.prepare(JobApi.PrepareJobRequest.newBuilder().setPipeline(pipelineOf()).build()); + assertThat(underTest.isShutdown()).isFalse(); + assertThat(response.getStagingSessionToken()).isEqualTo("token"); + assertThat(response.getPreparationId()).isEqualTo("preparationId"); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenRunError_forwardsException_canGracefulShutdown() { + TestJobService service = + new TestJobService().withErrorResponse(new RuntimeException(testName.getMethodName())); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + assertThrows( + 
RuntimeException.class, + () -> + underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("prepareId").build())); + assertThat(underTest.isShutdown()).isFalse(); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenRunSuccess_forwardsResponse_canGracefulShutdown() { + TestJobService service = + new TestJobService() + .withRunJobResponse(JobApi.RunJobResponse.newBuilder().setJobId("jobId").build()); + PrismJobManager underTest = prismJobManager(service); + assertThat(underTest.isShutdown()).isFalse(); + JobApi.RunJobResponse runJobResponse = + underTest.run(JobApi.RunJobRequest.newBuilder().setPreparationId("preparationId").build()); + assertThat(underTest.isShutdown()).isFalse(); + assertThat(runJobResponse.getJobId()).isEqualTo("jobId"); + underTest.close(); + assertThat(underTest.isShutdown()).isTrue(); + } + + @Test + public void givenTerminalState_closes() { + PrismJobManager underTest = prismJobManager(new TestJobService()); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.RUNNING); + assertThat(underTest.isShutdown()).isFalse(); + underTest.onStateChanged(PipelineResult.State.CANCELLED); + assertThat(underTest.isShutdown()).isTrue(); + + underTest.close(); + } + + private PrismJobManager prismJobManager(TestJobService service) { + String serverName = InProcessServerBuilder.generateName(); + try { + grpcCleanup.register( + InProcessServerBuilder.forName(serverName) + .directExecutor() + .addService(service) + .build() + .start()); + } catch (IOException e) { + throw new RuntimeException(e); + } + + ManagedChannel channel = + grpcCleanup.register(InProcessChannelBuilder.forName(serverName).build()); + + return PrismJobManager.builder() + .setTimeout(Duration.millis(3000L)) + .setEndpoint("ignore") + .setManagedChannel(channel) + .build(); + } + + private 
static class TestJobService extends JobServiceGrpc.JobServiceImplBase { + + private Optional prepareJobResponse = Optional.empty(); + private Optional runJobResponse = Optional.empty(); + private Optional error = Optional.empty(); + + TestJobService withPrepareJobResponse(JobApi.PrepareJobResponse prepareJobResponse) { + this.prepareJobResponse = Optional.of(prepareJobResponse); + return this; + } + + TestJobService withRunJobResponse(JobApi.RunJobResponse runJobResponse) { + this.runJobResponse = Optional.of(runJobResponse); + return this; + } + + TestJobService withErrorResponse(RuntimeException error) { + this.error = Optional.of(error); + return this; + } + + @Override + public void prepare( + JobApi.PrepareJobRequest request, + StreamObserver responseObserver) { + if (prepareJobResponse.isPresent()) { + responseObserver.onNext(prepareJobResponse.get()); + responseObserver.onCompleted(); + } + if (error.isPresent()) { + responseObserver.onError(error.get()); + } + } + + @Override + public void run( + JobApi.RunJobRequest request, StreamObserver responseObserver) { + if (runJobResponse.isPresent()) { + responseObserver.onNext(runJobResponse.get()); + responseObserver.onCompleted(); + } + if (error.isPresent()) { + responseObserver.onError(error.get()); + } + } + } + + private static RunnerApi.Pipeline pipelineOf() { + Pipeline pipeline = Pipeline.create(); + pipeline.apply(Impulse.create()); + return PipelineTranslation.toProto(pipeline); + } +} From ea982127b60545164e0e280eb0d4140f35ae3156 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Wed, 7 Aug 2024 22:48:03 -0400 Subject: [PATCH 33/78] Override BQ load job location when necessary (#31986) --- .../apache_beam/io/gcp/bigquery_file_loads.py | 18 +++++++++++++++++- .../io/gcp/bigquery_file_loads_test.py | 10 ++++++++++ .../apache_beam/io/gcp/bigquery_tools.py | 8 ++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git 
a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py index e1a4af31f1c2e..3203c21a8e64a 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py @@ -765,10 +765,26 @@ def process( GlobalWindows.windowed_value((destination, job_reference))) def finish_bundle(self): + dataset_locations = {} + for windowed_value in self.pending_jobs: + table_ref = bigquery_tools.parse_table_reference(windowed_value.value[0]) + project_dataset = (table_ref.projectId, table_ref.datasetId) + job_ref = windowed_value.value[1] + # In some cases (e.g. when the load job op returns a 409 ALREADY_EXISTS), + # the returned job reference may not include a location. In such cases, + # we need to override with the dataset's location. + job_location = job_ref.location + if not job_location and project_dataset not in dataset_locations: + job_location = self.bq_wrapper.get_table_location( + table_ref.projectId, table_ref.datasetId, table_ref.tableId) + dataset_locations[project_dataset] = job_location + self.bq_wrapper.wait_for_bq_job( - job_ref, sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS) + job_ref, + sleep_duration_sec=_SLEEP_DURATION_BETWEEN_POLLS, + location=job_location) return self.pending_jobs diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py index 0605206714ed1..f27c7899f9f38 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py @@ -426,6 +426,7 @@ def test_records_traverse_transform_with_mocks(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference @@ -481,6 +482,7 @@ def test_load_job_id_used(self): job_reference = 
bigquery_api.JobReference() job_reference.projectId = 'loadJobProject' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference @@ -515,6 +517,7 @@ def test_load_job_id_use_for_copy_job(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'loadJobProject' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = mock.Mock() result_job.jobReference = job_reference @@ -567,10 +570,12 @@ def test_wait_for_load_job_completion(self, sleep_mock): job_1.jobReference = bigquery_api.JobReference() job_1.jobReference.projectId = 'project1' job_1.jobReference.jobId = 'jobId1' + job_1.jobReference.location = 'US' job_2 = bigquery_api.Job() job_2.jobReference = bigquery_api.JobReference() job_2.jobReference.projectId = 'project1' job_2.jobReference.jobId = 'jobId2' + job_2.jobReference.location = 'US' job_1_waiting = mock.Mock() job_1_waiting.status.state = 'RUNNING' @@ -610,10 +615,12 @@ def test_one_load_job_failed_after_waiting(self, sleep_mock): job_1.jobReference = bigquery_api.JobReference() job_1.jobReference.projectId = 'project1' job_1.jobReference.jobId = 'jobId1' + job_1.jobReference.location = 'US' job_2 = bigquery_api.Job() job_2.jobReference = bigquery_api.JobReference() job_2.jobReference.projectId = 'project1' job_2.jobReference.jobId = 'jobId2' + job_2.jobReference.location = 'US' job_1_waiting = mock.Mock() job_1_waiting.status.state = 'RUNNING' @@ -650,6 +657,7 @@ def test_multiple_partition_files(self): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = mock.Mock() result_job.jobReference = job_reference @@ -732,6 +740,7 @@ def test_multiple_partition_files_write_dispositions( job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' 
result_job = mock.Mock() result_job.jobReference = job_reference @@ -774,6 +783,7 @@ def test_triggering_frequency(self, is_streaming, with_auto_sharding): job_reference = bigquery_api.JobReference() job_reference.projectId = 'project1' job_reference.jobId = 'job_name1' + job_reference.location = 'US' result_job = bigquery_api.Job() result_job.jobReference = job_reference diff --git a/sdks/python/apache_beam/io/gcp/bigquery_tools.py b/sdks/python/apache_beam/io/gcp/bigquery_tools.py index a92f30ec35ce4..c7128e7899ecb 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_tools.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_tools.py @@ -631,7 +631,8 @@ def _start_query_job( return self._start_job(request) - def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): + def wait_for_bq_job( + self, job_reference, sleep_duration_sec=5, max_retries=0, location=None): """Poll job until it is DONE. Args: @@ -639,6 +640,7 @@ def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): sleep_duration_sec: Specifies the delay in seconds between retries. max_retries: The total number of times to retry. If equals to 0, the function waits forever. + location: Fall back on this location if job_reference doesn't have one. 
Raises: `RuntimeError`: If the job is FAILED or the number of retries has been @@ -648,7 +650,9 @@ def wait_for_bq_job(self, job_reference, sleep_duration_sec=5, max_retries=0): while True: retry += 1 job = self.get_job( - job_reference.projectId, job_reference.jobId, job_reference.location) + job_reference.projectId, + job_reference.jobId, + job_reference.location or location) _LOGGER.info('Job %s status: %s', job.id, job.status.state) if job.status.state == 'DONE' and job.status.errorResult: raise RuntimeError( From adc3b2b4a5ffaeac50e73bce32027c85e8637cac Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 7 Aug 2024 22:37:13 -0700 Subject: [PATCH 34/78] Bump cloud.google.com/go/bigtable from 1.25.0 to 1.28.0 in /sdks (#32105) Bumps [cloud.google.com/go/bigtable](https://github.com/googleapis/google-cloud-go) from 1.25.0 to 1.28.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/documentai/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/pubsub/v1.25.0...pubsub/v1.28.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/bigtable dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 38 ++++++++++++------------ sdks/go.sum | 84 ++++++++++++++++++++++++++++------------------------- 2 files changed, 64 insertions(+), 58 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 654a456285431..5aeb14606c9d3 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,12 +23,12 @@ module github.com/apache/beam/sdks/v2 go 1.21 require ( - cloud.google.com/go/bigquery v1.61.0 - cloud.google.com/go/bigtable v1.25.0 + cloud.google.com/go/bigquery v1.62.0 + cloud.google.com/go/bigtable v1.28.0 cloud.google.com/go/datastore v1.17.1 cloud.google.com/go/profiler v0.4.0 - cloud.google.com/go/pubsub v1.39.0 - cloud.google.com/go/spanner v1.63.0 + cloud.google.com/go/pubsub v1.40.0 + cloud.google.com/go/spanner v1.64.0 cloud.google.com/go/storage v1.43.0 github.com/aws/aws-sdk-go-v2 v1.30.3 github.com/aws/aws-sdk-go-v2/config v1.27.4 @@ -53,13 +53,13 @@ require ( github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c go.mongodb.org/mongo-driver v1.16.0 - golang.org/x/net v0.26.0 + golang.org/x/net v0.27.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.8.0 golang.org/x/sys v0.22.0 golang.org/x/text v0.17.0 - google.golang.org/api v0.187.0 - google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d + google.golang.org/api v0.189.0 + google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f google.golang.org/grpc v1.65.0 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 @@ -74,8 +74,9 @@ require ( require ( cel.dev/expr v0.15.0 // indirect - cloud.google.com/go/auth v0.6.1 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect + cloud.google.com/go/auth v0.7.2 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect + cloud.google.com/go/monitoring v1.20.2 // indirect dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.1.0 // 
indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 // indirect @@ -84,7 +85,7 @@ require ( github.com/containerd/errdefs v0.1.0 // indirect github.com/containerd/log v0.1.0 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/golang/protobuf v1.5.4 // indirect @@ -107,15 +108,16 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/sdk v1.24.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect golang.org/x/time v0.5.0 // indirect ) require ( cloud.google.com/go v0.115.0 // indirect - cloud.google.com/go/compute/metadata v0.3.0 // indirect - cloud.google.com/go/iam v1.1.8 // indirect - cloud.google.com/go/longrunning v0.5.7 // indirect + cloud.google.com/go/compute/metadata v0.5.0 // indirect + cloud.google.com/go/iam v1.1.11 // indirect + cloud.google.com/go/longrunning v0.5.10 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect @@ -154,7 +156,7 @@ require ( github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect - github.com/googleapis/gax-go/v2 v2.12.5 // indirect + github.com/googleapis/gax-go/v2 v2.13.0 // indirect github.com/gorilla/handlers v1.5.2 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -182,9 +184,9 @@ require ( github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.25.0 // indirect - golang.org/x/mod v0.17.0 // 
indirect - golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + golang.org/x/mod v0.18.0 // indirect + golang.org/x/tools v0.22.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 1c03d5afc89cd..f0545e6c8e118 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -101,10 +101,10 @@ cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVo cloud.google.com/go/assuredworkloads v1.8.0/go.mod h1:AsX2cqyNCOvEQC8RMPnoc0yEarXQk6WEKkxYfL6kGIo= cloud.google.com/go/assuredworkloads v1.9.0/go.mod h1:kFuI1P78bplYtT77Tb1hi0FMxM0vVpRC7VVoJC3ZoT0= cloud.google.com/go/assuredworkloads v1.10.0/go.mod h1:kwdUQuXcedVdsIaKgKTp9t0UJkE5+PAVNhdQm4ZVq2E= -cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38= -cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4= -cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= -cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= +cloud.google.com/go/auth v0.7.2 h1:uiha352VrCDMXg+yoBtaD0tUF4Kv9vrtrWPYXwutnDE= +cloud.google.com/go/auth v0.7.2/go.mod h1:VEc4p5NNxycWQTMQEDQF0bd6aTMb6VgYDXEwiJJQAbs= +cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= +cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/automl v1.6.0/go.mod 
h1:ugf8a6Fx+zP0D59WLhqgTDsQI9w07o64uf/Is3Nh5p8= cloud.google.com/go/automl v1.7.0/go.mod h1:RL9MYCCsJEOmt0Wf3z9uzG0a7adTT1fe+aObgSpkCt8= @@ -133,10 +133,10 @@ cloud.google.com/go/bigquery v1.47.0/go.mod h1:sA9XOgy0A8vQK9+MWhEQTY6Tix87M/Zur cloud.google.com/go/bigquery v1.48.0/go.mod h1:QAwSz+ipNgfL5jxiaK7weyOhzdoAy1zFm0Nf1fysJac= cloud.google.com/go/bigquery v1.49.0/go.mod h1:Sv8hMmTFFYBlt/ftw2uN6dFdQPzBlREY9yBh7Oy7/4Q= cloud.google.com/go/bigquery v1.50.0/go.mod h1:YrleYEh2pSEbgTBZYMJ5SuSr0ML3ypjRB1zgf7pvQLU= -cloud.google.com/go/bigquery v1.61.0 h1:w2Goy9n6gh91LVi6B2Sc+HpBl8WbWhIyzdvVvrAuEIw= -cloud.google.com/go/bigquery v1.61.0/go.mod h1:PjZUje0IocbuTOdq4DBOJLNYB0WF3pAKBHzAYyxCwFo= -cloud.google.com/go/bigtable v1.25.0 h1:P3J0qFd2BUpvnamJOaTW9KkgqAiUXsFtFAW33sxj/hU= -cloud.google.com/go/bigtable v1.25.0/go.mod h1:NOwb5o8cw2LCEMP8SthXGxpZAjbQXc4Gb7V6A3TvsJc= +cloud.google.com/go/bigquery v1.62.0 h1:SYEA2f7fKqbSRRBHb7g0iHTtZvtPSPYdXfmqsjpsBwo= +cloud.google.com/go/bigquery v1.62.0/go.mod h1:5ee+ZkF1x/ntgCsFQJAQTM3QkAZOecfCmvxhkJsWRSA= +cloud.google.com/go/bigtable v1.28.0 h1:c0wc/wy+9Chj8BooqW/zgaeslXsA5YEYl84VBmvwp+4= +cloud.google.com/go/bigtable v1.28.0/go.mod h1:avmXcmxVbLJAo9moICRYMgDyTTPoV0MA0lHKnyqV4fQ= cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= cloud.google.com/go/billing v1.5.0/go.mod h1:mztb1tBc3QekhjSgmpf/CV4LzWXLzCArwpLmP2Gm88s= cloud.google.com/go/billing v1.6.0/go.mod h1:WoXzguj+BeHXPbKfNWkqVtDdzORazmCjraY+vrxcyvI= @@ -188,8 +188,8 @@ cloud.google.com/go/compute/metadata v0.1.0/go.mod h1:Z1VN+bulIf6bt4P/C37K4DyZYZ cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= cloud.google.com/go/compute/metadata v0.2.1/go.mod h1:jgHgmJd2RKBGzXqF5LR2EZMGxBkeanZ9wwa75XHJgOM= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= -cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= 
-cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= +cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= cloud.google.com/go/contactcenterinsights v1.3.0/go.mod h1:Eu2oemoePuEFc/xKFPjbTuPSj0fYJcPls9TFlPNnHHY= cloud.google.com/go/contactcenterinsights v1.4.0/go.mod h1:L2YzkGbPsv+vMQMCADxJoT9YiTTnSEd6fEvCeHTYVck= cloud.google.com/go/contactcenterinsights v1.6.0/go.mod h1:IIDlT6CLcDoyv79kDv8iWxMSTZhLxSCofVV5W6YFM/w= @@ -210,8 +210,8 @@ cloud.google.com/go/datacatalog v1.8.0/go.mod h1:KYuoVOv9BM8EYz/4eMFxrr4DUKhGIOX cloud.google.com/go/datacatalog v1.8.1/go.mod h1:RJ58z4rMp3gvETA465Vg+ag8BGgBdnRPEMMSTr5Uv+M= cloud.google.com/go/datacatalog v1.12.0/go.mod h1:CWae8rFkfp6LzLumKOnmVh4+Zle4A3NXLzVJ1d1mRm0= cloud.google.com/go/datacatalog v1.13.0/go.mod h1:E4Rj9a5ZtAxcQJlEBTLgMTphfP11/lNaAshpoBgemX8= -cloud.google.com/go/datacatalog v1.20.1 h1:czcba5mxwRM5V//jSadyig0y+8aOHmN7gUl9GbHu59E= -cloud.google.com/go/datacatalog v1.20.1/go.mod h1:Jzc2CoHudhuZhpv78UBAjMEg3w7I9jHA11SbRshWUjk= +cloud.google.com/go/datacatalog v1.20.4 h1:nUR7JBPZezl1+o+86N01VxAQQHY+It/D8tmNipcdVjI= +cloud.google.com/go/datacatalog v1.20.4/go.mod h1:71PDwywIYkNgSXdUU3H0mkTp3j15aahfYJ1CY3DogtU= cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= cloud.google.com/go/dataflow v0.7.0/go.mod h1:PX526vb4ijFMesO1o202EaUmouZKBpjHsTlCtB4parQ= cloud.google.com/go/dataflow v0.8.0/go.mod h1:Rcf5YgTKPtQyYz8bLYhFoIV/vP39eL7fWNcSOyFfLJE= @@ -327,8 +327,8 @@ cloud.google.com/go/iam v0.8.0/go.mod h1:lga0/y3iH6CX7sYqypWJ33hf7kkfXJag67naqGE cloud.google.com/go/iam v0.11.0/go.mod h1:9PiLDanza5D+oWFZiH1uG+RnRCfEGKoyl6yo4cgWZGY= cloud.google.com/go/iam v0.12.0/go.mod h1:knyHGviacl11zrtZUoDuYpDgLjvr28sLQaG0YB2GYAY= cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= 
-cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= -cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= +cloud.google.com/go/iam v1.1.11 h1:0mQ8UKSfdHLut6pH9FM3bI55KWR46ketn0PuXleDyxw= +cloud.google.com/go/iam v1.1.11/go.mod h1:biXoiLWYIKntto2joP+62sd9uW5EpkZmKIvfNcTWlnQ= cloud.google.com/go/iap v1.4.0/go.mod h1:RGFwRJdihTINIe4wZ2iCP0zF/qu18ZwyKxrhMhygBEc= cloud.google.com/go/iap v1.5.0/go.mod h1:UH/CGgKd4KyohZL5Pt0jSKE4m3FR51qg6FKQ/z/Ix9A= cloud.google.com/go/iap v1.6.0/go.mod h1:NSuvI9C/j7UdjGjIde7t7HBz+QTwBcapPE07+sSRcLk= @@ -348,8 +348,8 @@ cloud.google.com/go/kms v1.8.0/go.mod h1:4xFEhYFqvW+4VMELtZyxomGSYtSQKzM178ylFW4 cloud.google.com/go/kms v1.9.0/go.mod h1:qb1tPTgfF9RQP8e1wq4cLFErVuTJv7UsSC915J8dh3w= cloud.google.com/go/kms v1.10.0/go.mod h1:ng3KTUtQQU9bPX3+QGLsflZIHlkbn8amFAMY63m8d24= cloud.google.com/go/kms v1.10.1/go.mod h1:rIWk/TryCkR59GMC3YtHtXeLzd634lBbKenvyySAyYI= -cloud.google.com/go/kms v1.18.0 h1:pqNdaVmZJFP+i8OVLocjfpdTWETTYa20FWOegSCdrRo= -cloud.google.com/go/kms v1.18.0/go.mod h1:DyRBeWD/pYBMeyiaXFa/DGNyxMDL3TslIKb8o/JkLkw= +cloud.google.com/go/kms v1.18.3 h1:8+Z2S4bQDSCdghB5ZA5dVDDJTLmnkRlowtFiXqMFd74= +cloud.google.com/go/kms v1.18.3/go.mod h1:y/Lcf6fyhbdn7MrG1VaDqXxM8rhOBc5rWcWAhcvZjQU= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/language v1.7.0/go.mod h1:DJ6dYN/W+SQOjF8e1hLQXMF21AkH2w9wiPzPCJa2MIE= @@ -363,8 +363,8 @@ cloud.google.com/go/logging v1.7.0/go.mod h1:3xjP2CjkM3ZkO73aj4ASA5wRPGGCRrPIAeN cloud.google.com/go/longrunning v0.1.1/go.mod h1:UUFxuDWkv22EuY93jjmDMFT5GPQKeFVJBIF6QlTqdsE= cloud.google.com/go/longrunning v0.3.0/go.mod h1:qth9Y41RRSUE69rDcOn6DdK3HfQfsUI0YSmW3iIlLJc= cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= -cloud.google.com/go/longrunning v0.5.7 
h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU= -cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng= +cloud.google.com/go/longrunning v0.5.10 h1:eB/BniENNRKhjz/xgiillrdcH3G74TGSl3BXinGlI7E= +cloud.google.com/go/longrunning v0.5.10/go.mod h1:tljz5guTr5oc/qhlUjBlk7UAIFMOGuPNxkNDZXlLics= cloud.google.com/go/managedidentities v1.3.0/go.mod h1:UzlW3cBOiPrzucO5qWkNkh0w33KFtBJU281hacNvsdE= cloud.google.com/go/managedidentities v1.4.0/go.mod h1:NWSBYbEMgqmbZsLIyKvxrYbtqOsxY1ZrGM+9RgDqInM= cloud.google.com/go/managedidentities v1.5.0/go.mod h1:+dWcZ0JlUmpuxpIDfyP5pP5y0bLdRwOS4Lp7gMni/LA= @@ -388,6 +388,8 @@ cloud.google.com/go/monitoring v1.7.0/go.mod h1:HpYse6kkGo//7p6sT0wsIC6IBDET0RhI cloud.google.com/go/monitoring v1.8.0/go.mod h1:E7PtoMJ1kQXWxPjB6mv2fhC5/15jInuulFdYYtlcvT4= cloud.google.com/go/monitoring v1.12.0/go.mod h1:yx8Jj2fZNEkL/GYZyTLS4ZtZEZN8WtDEiEqG4kLK50w= cloud.google.com/go/monitoring v1.13.0/go.mod h1:k2yMBAB1H9JT/QETjNkgdCGD9bPF712XiLTVr+cBrpw= +cloud.google.com/go/monitoring v1.20.2 h1:B/L+xrw9PYO7ywh37sgnjI/6dzEE+yQTAwfytDcpPto= +cloud.google.com/go/monitoring v1.20.2/go.mod h1:36rpg/7fdQ7NX5pG5x1FA7cXTVXusOp6Zg9r9e1+oek= cloud.google.com/go/networkconnectivity v1.4.0/go.mod h1:nOl7YL8odKyAOtzNX73/M5/mGZgqqMeryi6UPZTk/rA= cloud.google.com/go/networkconnectivity v1.5.0/go.mod h1:3GzqJx7uhtlM3kln0+x5wyFvuVH1pIBJjhCpjzSt75o= cloud.google.com/go/networkconnectivity v1.6.0/go.mod h1:OJOoEXW+0LAxHh89nXd64uGG+FbQoeH8DtxCHVOMlaM= @@ -447,8 +449,8 @@ cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcd cloud.google.com/go/pubsub v1.27.1/go.mod h1:hQN39ymbV9geqBnfQq6Xf63yNhUAhv9CZhzp5O6qsW0= cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8= cloud.google.com/go/pubsub v1.30.0/go.mod h1:qWi1OPS0B+b5L+Sg6Gmc9zD1Y+HaM0MdUr7LsupY1P4= -cloud.google.com/go/pubsub v1.39.0 h1:qt1+S6H+wwW8Q/YvDwM8lJnq+iIFgFEgaD/7h3lMsAI= -cloud.google.com/go/pubsub v1.39.0/go.mod 
h1:FrEnrSGU6L0Kh3iBaAbIUM8KMR7LqyEkMboVxGXCT+s= +cloud.google.com/go/pubsub v1.40.0 h1:0LdP+zj5XaPAGtWr2V6r88VXJlmtaB/+fde1q3TU8M0= +cloud.google.com/go/pubsub v1.40.0/go.mod h1:BVJI4sI2FyXp36KFKvFwcfDRDfR8MiLT8mMhmIhdAeA= cloud.google.com/go/pubsublite v1.5.0/go.mod h1:xapqNQ1CuLfGi23Yda/9l4bBCKz/wC3KIJ5gKcxveZg= cloud.google.com/go/pubsublite v1.6.0/go.mod h1:1eFCS0U11xlOuMFV/0iBqw3zP12kddMeCbj/F3FSj9k= cloud.google.com/go/pubsublite v1.7.0/go.mod h1:8hVMwRXfDfvGm3fahVbtDbiLePT3gpoiJYJY+vxWxVM= @@ -538,8 +540,8 @@ cloud.google.com/go/shell v1.6.0/go.mod h1:oHO8QACS90luWgxP3N9iZVuEiSF84zNyLytb+ cloud.google.com/go/spanner v1.41.0/go.mod h1:MLYDBJR/dY4Wt7ZaMIQ7rXOTLjYrmxLE/5ve9vFfWos= cloud.google.com/go/spanner v1.44.0/go.mod h1:G8XIgYdOK+Fbcpbs7p2fiprDw4CaZX63whnSMLVBxjk= cloud.google.com/go/spanner v1.45.0/go.mod h1:FIws5LowYz8YAE1J8fOS7DJup8ff7xJeetWEo5REA2M= -cloud.google.com/go/spanner v1.63.0 h1:P6+BY70Wtol4MtryBgnXZVTZfsdySEvWfz0EpyLwHi4= -cloud.google.com/go/spanner v1.63.0/go.mod h1:iqDx7urZpgD7RekZ+CFvBRH6kVTW1ZSEb2HMDKOp5Cc= +cloud.google.com/go/spanner v1.64.0 h1:ltyPbHA/nRAtAhU/o742dXBCI1eNHPeaRY09Ja8B+hM= +cloud.google.com/go/spanner v1.64.0/go.mod h1:TOFx3pb2UwPsDGlE1gTehW+y6YlU4IFk+VdDHSGQS/M= cloud.google.com/go/speech v1.6.0/go.mod h1:79tcr4FHCimOp56lwC01xnt/WPJZc4v3gzyT7FoBkCM= cloud.google.com/go/speech v1.7.0/go.mod h1:KptqL+BAQIhMsj1kOP2la5DSEEerPDuOP/2mmkhHhZQ= cloud.google.com/go/speech v1.8.0/go.mod h1:9bYIl1/tjsAnMgKGHKmBZzXKEkGgtU+MpdDPTE9f7y0= @@ -827,8 +829,8 @@ github.com/go-gorp/gorp v2.2.0+incompatible/go.mod h1:7IfkAQnO7jfT/9IQ3R9wL1dFhu github.com/go-latex/latex v0.0.0-20210118124228-b3d85cf34e07/go.mod h1:CO1AlKB2CSIqUrmQPqA0gdRIlnLEY0gK5JGjh37zN5U= github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= 
-github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= @@ -966,8 +968,8 @@ github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqE github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= github.com/googleapis/gax-go/v2 v2.7.0/go.mod h1:TEop28CZZQ2y+c0VxMUmu1lV+fQx57QpBWsYpwqHJx8= github.com/googleapis/gax-go/v2 v2.7.1/go.mod h1:4orTrqY6hXxxaUL4LHIPl6lGo8vAE38/qKbhSAKP6QI= -github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA= -github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E= +github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s= +github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/gorilla/handlers v1.5.2 h1:cLTUSsNkgcwhgRqvCNmdbRWG0A3N4F+M2nWKdScwyEE= @@ -1232,6 +1234,8 @@ go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGX go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= +go.opentelemetry.io/otel/sdk/metric v1.24.0 
h1:yyMQrPzF+k88/DbH7o4FMAs80puqd+9osbiBrJrz/w8= +go.opentelemetry.io/otel/sdk/metric v1.24.0/go.mod h1:I6Y5FjH6rvEnTTAYQz3Mmv2kl6Ek5IIrmwTLqMrrOE0= go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= @@ -1312,8 +1316,8 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= +golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1373,8 +1377,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod 
h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1613,8 +1617,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.3.0/go.mod h1:/rWhSS2+zyEVwoJf8YAX6L2f0ntZ7Kn/mGgAWcipA5k= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA= +golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -1692,8 +1696,8 @@ google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/ google.golang.org/api v0.110.0/go.mod h1:7FC4Vvx1Mooxh8C5HWjzZHcavuS2f6pmJpZx60ca7iI= google.golang.org/api v0.111.0/go.mod h1:qtFHvU9mhgTJegR31csQ+rwxyUTHOKFqCKWp1J0fdw0= google.golang.org/api v0.114.0/go.mod h1:ifYI2ZsFK6/uGddGfAD5BMxlnkBqCmqHSDUVi45N5Yg= -google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo= -google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk= +google.golang.org/api v0.189.0 
h1:equMo30LypAkdkLMBqfeIqtyAnlyig1JSZArl4XPwdI= +google.golang.org/api v0.189.0/go.mod h1:FLWGJKb0hb+pU2j+rJqwbnsF+ym+fQs73rbJ+KAUgy8= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1833,12 +1837,12 @@ google.golang.org/genproto v0.0.0-20230323212658-478b75c54725/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230330154414-c0448cd141ea/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls= -google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc= -google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f h1:htT2I9bZvGm+110zq8bIErMX+WgBWxCzV3ChwbvnKnc= +google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:Sk3mLpoDFTAp6R4OvlcUgaG4ISTspKeFsIAXMn9Bm4Y= +google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade h1:WxZOF2yayUHpHSbUE6NMzumUzBxYc3YGwo0YHnbzsJY= 
+google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade/go.mod h1:mw8MG/Qz5wfgYr6VqVCiZcHe/GJEfI+oGGDCohaVgB0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade h1:oCRSWfwGXQsqlVdErcyTt4A93Y8fo0/9D4b1gnI++qo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From 1f09065ff325763464dfd618c4175a903f68301a Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Thu, 8 Aug 2024 08:12:31 -0400 Subject: [PATCH 35/78] Fix classifier dropped in artifact pom.xml (#32100) --- .../groovy/org/apache/beam/gradle/BeamModulePlugin.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index e603e49f842fe..ee116423e4b00 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -1978,8 +1978,8 @@ class BeamModulePlugin implements Plugin { def dependencyNode = dependenciesNode.appendNode('dependency') def appendClassifier = { dep -> dep.artifacts.each { art -> - if (art.hasProperty('archiveClassifier')) { - dependencyNode.appendNode('archiveClassifier', art.archiveClassifier) + if (art.hasProperty('classifier')) { + dependencyNode.appendNode('classifier', art.classifier) } } } From 502c728dd23e8b93691c87bd3b597d017782b418 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 10:29:28 -0400 Subject: [PATCH 36/78] Update cython requirement from <1 to <4 in /sdks/python 
(#32087) Updates the requirements on [cython](https://github.com/cython/cython) to permit the latest version. - [Release notes](https://github.com/cython/cython/releases) - [Changelog](https://github.com/cython/cython/blob/master/CHANGES.rst) - [Commits](https://github.com/cython/cython/compare/0.9.6.14...3.0.11) --- updated-dependencies: - dependency-name: cython dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/python/container/base_image_requirements_manual.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index 5bc60c474b4fe..ab5dcf30aa50e 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -27,7 +27,7 @@ bs4 # Commonly used HTML processing tool. # Don't upgrade to Cython 3.x, until it's released, stable and we have consensus # to upgrade. Use 0.xx for now. -cython<1 +cython<4 # future is no longer a Beam dependency, but is an implicit dependency in # some versions of libraries that launch Beam pipelines, like tensorflow-transform. # Leaving 'future' in our containers for now prevent breaking tft users. From 529996241be1bd4482a529f88b742eeee9867daa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:22:07 -0400 Subject: [PATCH 37/78] Bump go.mongodb.org/mongo-driver from 1.16.0 to 1.16.1 in /sdks (#32104) Bumps [go.mongodb.org/mongo-driver](https://github.com/mongodb/mongo-go-driver) from 1.16.0 to 1.16.1. 
- [Release notes](https://github.com/mongodb/mongo-go-driver/releases) - [Commits](https://github.com/mongodb/mongo-go-driver/compare/v1.16.0...v1.16.1) --- updated-dependencies: - dependency-name: go.mongodb.org/mongo-driver dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 5aeb14606c9d3..397820f94cb77 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -52,7 +52,7 @@ require ( github.com/tetratelabs/wazero v1.7.3 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c - go.mongodb.org/mongo-driver v1.16.0 + go.mongodb.org/mongo-driver v1.16.1 golang.org/x/net v0.27.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.8.0 diff --git a/sdks/go.sum b/sdks/go.sum index f0545e6c8e118..05de032d212ab 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1209,8 +1209,8 @@ github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaD go.einride.tech/aip v0.67.1 h1:d/4TW92OxXBngkSOwWS2CH5rez869KpKMaN44mdxkFI= go.einride.tech/aip v0.67.1/go.mod h1:ZGX4/zKw8dcgzdLsrvpOOGxfxI2QSk12SlP7d6c0/XI= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= -go.mongodb.org/mongo-driver v1.16.0 h1:tpRsfBJMROVHKpdGyc1BBEzzjDUWjItxbVSZ8Ls4BQ4= -go.mongodb.org/mongo-driver v1.16.0/go.mod h1:oB6AhJQvFQL4LEHyXi6aJzQJtBiTQHiAd83l0GdFaiw= +go.mongodb.org/mongo-driver v1.16.1 h1:rIVLL3q0IHM39dvE+z2ulZLp9ENZKThVfuvN/IiN4l8= +go.mongodb.org/mongo-driver v1.16.1/go.mod h1:oB6AhJQvFQL4LEHyXi6aJzQJtBiTQHiAd83l0GdFaiw= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod 
h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= From 16d95835df1b367892ca1e6895306af29d4b81c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:22:25 -0400 Subject: [PATCH 38/78] Bump github.com/proullon/ramsql from 0.1.3 to 0.1.4 in /sdks (#32106) Bumps [github.com/proullon/ramsql](https://github.com/proullon/ramsql) from 0.1.3 to 0.1.4. - [Release notes](https://github.com/proullon/ramsql/releases) - [Commits](https://github.com/proullon/ramsql/compare/v0.1.3...v0.1.4) --- updated-dependencies: - dependency-name: github.com/proullon/ramsql dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 397820f94cb77..26c1d9d376902 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -46,7 +46,7 @@ require ( github.com/linkedin/goavro/v2 v2.13.0 github.com/nats-io/nats-server/v2 v2.10.18 github.com/nats-io/nats.go v1.36.0 - github.com/proullon/ramsql v0.1.3 + github.com/proullon/ramsql v0.1.4 github.com/spf13/cobra v1.8.1 github.com/testcontainers/testcontainers-go v0.32.0 github.com/tetratelabs/wazero v1.7.3 diff --git a/sdks/go.sum b/sdks/go.sum index 05de032d212ab..1f57de7507cd8 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1117,8 +1117,8 @@ github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:Om github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= -github.com/proullon/ramsql v0.1.3 h1:/LRcXJf4lEmhdb4tYcci473I2VynjcZSzh2hsjJ8rSk= 
-github.com/proullon/ramsql v0.1.3/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= +github.com/proullon/ramsql v0.1.4 h1:yTFRTn46gFH/kPbzCx+mGjuFlyTBUeDr3h2ldwxddl0= +github.com/proullon/ramsql v0.1.4/go.mod h1:CFGqeQHQpdRfWqYmWD3yXqPTEaHkF4zgXy1C6qDWc9E= github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= From 18849de0cc4e6aff82af6ed897fa7baec9deb084 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Thu, 8 Aug 2024 17:46:13 +0200 Subject: [PATCH 39/78] Revert "Update cython requirement from <1 to <4 in /sdks/python (#32087)" (#32110) This reverts commit 502c728dd23e8b93691c87bd3b597d017782b418. --- sdks/python/container/base_image_requirements_manual.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index ab5dcf30aa50e..5bc60c474b4fe 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -27,7 +27,7 @@ bs4 # Commonly used HTML processing tool. # Don't upgrade to Cython 3.x, until it's released, stable and we have consensus # to upgrade. Use 0.xx for now. -cython<4 +cython<1 # future is no longer a Beam dependency, but is an implicit dependency in # some versions of libraries that launch Beam pipelines, like tensorflow-transform. # Leaving 'future' in our containers for now prevent breaking tft users. From a6de47572b927c8a1c3fdaf11f15b6d02473c3e8 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Thu, 8 Aug 2024 10:08:21 -0700 Subject: [PATCH 40/78] [Go SDK] s3 filesystem: Fix nillable content length, update deps. 
(#32111) Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- sdks/go.mod | 14 +++++++------- sdks/go.sum | 14 ++++++++++++++ sdks/go/pkg/beam/io/filesystem/s3/s3.go | 10 +++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 26c1d9d376902..bd25b3beab5aa 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -31,10 +31,10 @@ require ( cloud.google.com/go/spanner v1.64.0 cloud.google.com/go/storage v1.43.0 github.com/aws/aws-sdk-go-v2 v1.30.3 - github.com/aws/aws-sdk-go-v2/config v1.27.4 + github.com/aws/aws-sdk-go-v2/config v1.27.27 github.com/aws/aws-sdk-go-v2/credentials v1.17.27 - github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 - github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 + github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 + github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 github.com/aws/smithy-go v1.20.3 github.com/docker/go-connections v0.5.0 github.com/dustin/go-humanize v1.0.1 @@ -123,16 +123,16 @@ require ( github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect github.com/apache/thrift v0.17.0 // indirect github.com/aws/aws-sdk-go v1.34.0 // indirect - github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect - github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17 // indirect 
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 // indirect github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 1f57de7507cd8..a50c8ce9230c9 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -683,9 +683,13 @@ github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP github.com/aws/aws-sdk-go-v2 v1.30.3/go.mod h1:nIQjQVp5sfpQcTc9mPSr1B0PaWK5ByX9MOoDadSN4lc= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 h1:ZY3108YtBNq96jNZTICHxN1gSBSbnvIdYwwqnvCV4Mc= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1/go.mod h1:t8PYl/6LzdAqsU4/9tz28V/kU+asFePvpOMkdul0gEQ= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 h1:tW1/Rkad38LA15X4UQtjXZXNKsCgkshC3EbmcUmghTg= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3/go.mod h1:UbnqO+zjqk3uIt9yCACHJ9IVNhyhOCnYk8yA19SAWrM= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= github.com/aws/aws-sdk-go-v2/config v1.27.4 h1:AhfWb5ZwimdsYTgP7Od8E9L1u4sKmDW2ZVeLcf2O42M= github.com/aws/aws-sdk-go-v2/config v1.27.4/go.mod h1:zq2FFXK3A416kiukwpsd+rD4ny6JC7QSkp4QdN1Mp2g= +github.com/aws/aws-sdk-go-v2/config v1.27.27 h1:HdqgGt1OAP0HkEDDShEl0oSYa9ZZBSOmKpdpsDMdO90= +github.com/aws/aws-sdk-go-v2/config v1.27.27/go.mod h1:MVYamCg76dFNINkZFu4n4RjDixhVr51HLj4ErWzrVwg= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= github.com/aws/aws-sdk-go-v2/credentials v1.17.27 h1:2raNba6gr2IfA0eqqiP2XiQ0UVOpGPgDSi0I9iAP+UI= github.com/aws/aws-sdk-go-v2/credentials v1.17.27/go.mod h1:gniiwbGahQByxan6YjQUMcW4Aov6bLC3m+evgcoN4r4= @@ -695,6 +699,8 @@ 
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvH github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 h1:wuOjvalpd2CnXffks74Vq6n3yv9vunKCoy4R1sjStGk= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8/go.mod h1:vywwjy6VnrR48Izg136JoSUXC4mH9QeUi3g0EH9DSrA= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 h1:zeN9UtUlA6FTx0vFSayxSX32HDw73Yb6Hh2izDSFxXY= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10/go.mod h1:3HKuexPDcwLWPaqpW2UR/9n8N/u/3CKcGAzSs8p8u8g= github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15/go.mod h1:U9ke74k1n2bf+RIgoX1SXFed1HLs51OgUSs+Ph0KJP8= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 h1:C6WHdGnTDIYETAm5iErQUiVNsclNx9qbJVPIt03B6bI= @@ -704,20 +710,28 @@ github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7 github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY= github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 h1:lMwCXiWJlrtZot0NJTjbC8G9zl+V3i68gBTBBvDeEXA= github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3/go.mod h1:5yzAuE9i2RkVAttBl8yxZgQr5OCq4D5yDnG7j9x2L0U= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15 h1:Z5r7SycxmSllHYmaAZPpmN8GviDrSGhMS6bldqtXZPw= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15/go.mod h1:CetW7bDE00QoGEmPUoZuRog07SGVAUVW6LFpNP0YfIg= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 h1:dT3MqvGhSoaIhRseqw2I0yH81l7wiR2vjs57O51EAm8= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3/go.mod h1:GlAeCkHwugxdHaueRr4nhPuY+WW+gR8UjlcqzPr1SPI= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 
h1:xbwRyCy7kXrOj89iIKLB6NfE2WCpP9HoKyk8dMDvnIQ= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3/go.mod h1:R+/S1O4TYpcktbVwddeOYg+uwUfLhADP2S/x4QwsCTM= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17 h1:YPYe6ZmvUfDDDELqEKtAd6bo8zxhkm+XEFEzQisqUIE= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17/go.mod h1:oBtcnYua/CgzCWYN7NZ5j7PotFDaFSUjCYVTtfyn7vw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrxZlQ044RiM+WdoZxp0p+EGM62y3L6pwA4olE= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 h1:KV0z2RDc7euMtg8aUT1czv5p29zcLlXALNFsd3jkkEc= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3/go.mod h1:KZgs2ny8HsxRIRbDwgvJcHHBZPOzQr/+NtGwnP+w2ec= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 h1:246A4lSTXWJw/rmlQI+TT2OcqeDMKBdyjEQrafMaQdA= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15/go.mod h1:haVfg3761/WF7YPuJOER2MP0k4UAXyHaLclKXB6usDg= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 h1:NnduxUd9+Fq9DcCDdJK8v6l9lR1xDX4usvog+JuQAno= github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2/go.mod h1:NXRKkiRF+erX2hnybnVU660cYT5/KChRD4iUgJ97cI8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 h1:hT8ZAZRIfqBqHbzKTII+CIiY8G2oC9OpLedkZ51DWl8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3/go.mod h1:Lcxzg5rojyVPU/0eFwLtcyTaek/6Mtic5B1gJo7e/zE= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 
h1:BXx0ZIxvrJdSgSvKTZ+yRBeSqqgPM89VPlulEcl37tM= github.com/aws/aws-sdk-go-v2/service/sso v1.22.4/go.mod h1:ooyCOXjvJEsUw7x+ZDHeISPMhtwI3ZCB7ggFMcFfWLU= diff --git a/sdks/go/pkg/beam/io/filesystem/s3/s3.go b/sdks/go/pkg/beam/io/filesystem/s3/s3.go index 97a2c9aada14a..40fde0a300b28 100644 --- a/sdks/go/pkg/beam/io/filesystem/s3/s3.go +++ b/sdks/go/pkg/beam/io/filesystem/s3/s3.go @@ -149,7 +149,7 @@ func (f *fs) OpenWrite(ctx context.Context, filename string) (io.WriteCloser, er func (f *fs) Size(ctx context.Context, filename string) (int64, error) { bucket, key, err := parseURI(filename) if err != nil { - return -1, fmt.Errorf("error parsing S3 uri %s: %v", filename, err) + return -1, fmt.Errorf("error parsing S3 uri %s: %w", filename, err) } params := &s3.HeadObjectInput{ @@ -158,10 +158,14 @@ func (f *fs) Size(ctx context.Context, filename string) (int64, error) { } output, err := f.client.HeadObject(ctx, params) if err != nil { - return -1, fmt.Errorf("error getting metadata for object %s: %v", filename, err) + return -1, fmt.Errorf("error getting metadata for object %s: %w", filename, err) } - return output.ContentLength, err + if output.ContentLength != nil { + return *output.ContentLength, nil + } + + return -1, fmt.Errorf("content length for object %s was nil", filename) } // LastModified returns the time at which the file was last modified. From 679e9d799e5be7d906edd9e6d59aeefc5b755257 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 8 Aug 2024 09:55:23 -0700 Subject: [PATCH 41/78] Upgrade Beam to use Cython 3. Many files, including all of those that would be likely to have issues with string/bytes, were already setting language level to 3. Taking a pass through the compiled files, I did not see any incompatibilities. Tests seem to be fine as well. 
--- sdks/python/container/base_image_requirements_manual.txt | 4 +--- sdks/python/pyproject.toml | 2 +- sdks/python/setup.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index 5bc60c474b4fe..2d99a55e564ea 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -25,9 +25,7 @@ # Consider constraining requirements of Beam itself when necessary. bs4 # Commonly used HTML processing tool. -# Don't upgrade to Cython 3.x, until it's released, stable and we have consensus -# to upgrade. Use 0.xx for now. -cython<1 +cython>=3,<4 # future is no longer a Beam dependency, but is an implicit dependency in # some versions of libraries that launch Beam pipelines, like tensorflow-transform. # Leaving 'future' in our containers for now prevent breaking tft users. diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 32924a9297750..037e5a8aed6bb 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -28,7 +28,7 @@ requires = [ # Numpy headers "numpy>=1.14.3,<1.27", # Update setup.py as well. # having cython here will create wheels that are platform dependent. - "cython==0.29.36", + "cython>=3.0,<4", ## deps for generating external transform wrappers: # also update PyYaml bounds in sdks:python:generateExternalTransformsConfig 'pyyaml>=3.12,<7.0.0', diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c9b2d087d04ca..5f631e3dfdab3 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -113,7 +113,7 @@ def get_version(): # `pipenv` package managers. 
pass -REQUIRED_CYTHON_VERSION = '0.28.1' +REQUIRED_CYTHON_VERSION = '3.0.0' try: _CYTHON_VERSION = distribution('cython').version if parse_version(_CYTHON_VERSION) < parse_version(REQUIRED_CYTHON_VERSION): From c825434965ee41f13b87ac37b99731c00c142bde Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 8 Aug 2024 10:19:52 -0700 Subject: [PATCH 42/78] Update base image requirements. Not running script due to pre-exising issues. --- sdks/python/container/py310/base_image_requirements.txt | 2 +- sdks/python/container/py311/base_image_requirements.txt | 2 +- sdks/python/container/py312/base_image_requirements.txt | 2 +- sdks/python/container/py38/base_image_requirements.txt | 2 +- sdks/python/container/py39/base_image_requirements.txt | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index e2806270e5443..35eea227888bf 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -36,7 +36,7 @@ cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 cryptography==42.0.5 -Cython==0.29.37 +Cython==3.0.10 Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 1cba006ad32cc..d6d523689fa77 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -35,7 +35,7 @@ cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 cryptography==42.0.5 -Cython==0.29.37 +Cython==3.0.10 Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index 4a6147b573b85..b7a5f3687166b 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ 
b/sdks/python/container/py312/base_image_requirements.txt @@ -35,7 +35,7 @@ cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 cryptography==42.0.7 -Cython==0.29.37 +Cython==3.0.10 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index f88dba103469b..c92761473362b 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -37,7 +37,7 @@ cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 cryptography==42.0.5 -Cython==0.29.37 +Cython==3.0.10 Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 39b888dd6ab75..bab94181499c4 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -36,7 +36,7 @@ cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 cryptography==42.0.5 -Cython==0.29.37 +Cython==3.0.10 Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 From 2b7e84239eab55347346859bbb78741684a5a6bf Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 8 Aug 2024 10:20:22 -0700 Subject: [PATCH 43/78] Remove now unneeded language level specifications. 
--- sdks/python/apache_beam/coders/coder_impl.py | 2 -- sdks/python/apache_beam/coders/stream.pyx | 2 -- sdks/python/apache_beam/metrics/cells.py | 2 -- sdks/python/apache_beam/metrics/execution.py | 2 -- sdks/python/apache_beam/metrics/monitoring_infos.py | 2 -- sdks/python/apache_beam/runners/common.py | 1 - sdks/python/apache_beam/runners/worker/logger.py | 2 -- sdks/python/apache_beam/runners/worker/opcounters.py | 2 -- sdks/python/apache_beam/runners/worker/operations.py | 2 -- sdks/python/apache_beam/runners/worker/statesampler_fast.pyx | 2 -- sdks/python/apache_beam/testing/fast_test_utils.pyx | 2 -- sdks/python/apache_beam/transforms/cy_combiners.py | 2 -- .../apache_beam/transforms/cy_dataflow_distribution_counter.pyx | 2 -- sdks/python/apache_beam/transforms/stats.py | 2 -- sdks/python/apache_beam/utils/counters.py | 1 - sdks/python/apache_beam/utils/windowed_value.py | 2 -- 16 files changed, 30 deletions(-) diff --git a/sdks/python/apache_beam/coders/coder_impl.py b/sdks/python/apache_beam/coders/coder_impl.py index e44c2535156e5..ff5fb5bef7ac9 100644 --- a/sdks/python/apache_beam/coders/coder_impl.py +++ b/sdks/python/apache_beam/coders/coder_impl.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """Coder implementations. The actual encode/decode implementations are split off from coders to diff --git a/sdks/python/apache_beam/coders/stream.pyx b/sdks/python/apache_beam/coders/stream.pyx index 8f941c151bde7..3977660f68b06 100644 --- a/sdks/python/apache_beam/coders/stream.pyx +++ b/sdks/python/apache_beam/coders/stream.pyx @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """Compiled version of the Stream objects used by CoderImpl. For internal use only; no backwards-compatibility guarantees. 
diff --git a/sdks/python/apache_beam/metrics/cells.py b/sdks/python/apache_beam/metrics/cells.py index 53b6fc8495920..3bfbfc6b2e773 100644 --- a/sdks/python/apache_beam/metrics/cells.py +++ b/sdks/python/apache_beam/metrics/cells.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """ This file contains metric cell classes. A metric cell is used to accumulate in-memory changes to a metric. It represents a specific metric in a single diff --git a/sdks/python/apache_beam/metrics/execution.py b/sdks/python/apache_beam/metrics/execution.py index 4202f7996c7fd..570062371cae6 100644 --- a/sdks/python/apache_beam/metrics/execution.py +++ b/sdks/python/apache_beam/metrics/execution.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """ This module is for internal use only; no backwards-compatibility guarantees. diff --git a/sdks/python/apache_beam/metrics/monitoring_infos.py b/sdks/python/apache_beam/metrics/monitoring_infos.py index 7bc7cced280c1..0e638c9eb4fe7 100644 --- a/sdks/python/apache_beam/metrics/monitoring_infos.py +++ b/sdks/python/apache_beam/metrics/monitoring_infos.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - # pytype: skip-file import collections diff --git a/sdks/python/apache_beam/runners/common.py b/sdks/python/apache_beam/runners/common.py index 40a3341e2b4c3..ba4dd98c7a937 100644 --- a/sdks/python/apache_beam/runners/common.py +++ b/sdks/python/apache_beam/runners/common.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# cython: language_level=3 """Worker operations executor. 
diff --git a/sdks/python/apache_beam/runners/worker/logger.py b/sdks/python/apache_beam/runners/worker/logger.py index 1efebeb3c78c0..06e2508fb7d29 100644 --- a/sdks/python/apache_beam/runners/worker/logger.py +++ b/sdks/python/apache_beam/runners/worker/logger.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """Python worker logging.""" # pytype: skip-file diff --git a/sdks/python/apache_beam/runners/worker/opcounters.py b/sdks/python/apache_beam/runners/worker/opcounters.py index ba53cbcbce7f5..51ca4cf0545b7 100644 --- a/sdks/python/apache_beam/runners/worker/opcounters.py +++ b/sdks/python/apache_beam/runners/worker/opcounters.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """Counters collect the progress of the Worker for reporting to the service.""" # pytype: skip-file diff --git a/sdks/python/apache_beam/runners/worker/operations.py b/sdks/python/apache_beam/runners/worker/operations.py index 00a652c49e669..58c807c28dbd4 100644 --- a/sdks/python/apache_beam/runners/worker/operations.py +++ b/sdks/python/apache_beam/runners/worker/operations.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """Worker operations executor.""" # pytype: skip-file diff --git a/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx b/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx index 7c082b7a62269..d02d05c3af942 100644 --- a/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx +++ b/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """State sampler for tracking time spent in execution steps. The state sampler profiles the time spent in each step of a pipeline. 
diff --git a/sdks/python/apache_beam/testing/fast_test_utils.pyx b/sdks/python/apache_beam/testing/fast_test_utils.pyx index d78a5b773186f..d815bcfe8f966 100644 --- a/sdks/python/apache_beam/testing/fast_test_utils.pyx +++ b/sdks/python/apache_beam/testing/fast_test_utils.pyx @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - cimport libc.stdint cimport libc.stdlib cimport libc.string diff --git a/sdks/python/apache_beam/transforms/cy_combiners.py b/sdks/python/apache_beam/transforms/cy_combiners.py index 2267d02c1908c..b5cc7493a29a3 100644 --- a/sdks/python/apache_beam/transforms/cy_combiners.py +++ b/sdks/python/apache_beam/transforms/cy_combiners.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """A library of basic cythonized CombineFn subclasses. For internal use only; no backwards-compatibility guarantees. diff --git a/sdks/python/apache_beam/transforms/cy_dataflow_distribution_counter.pyx b/sdks/python/apache_beam/transforms/cy_dataflow_distribution_counter.pyx index c1b32356ed1ff..c117ae32f4e5f 100644 --- a/sdks/python/apache_beam/transforms/cy_dataflow_distribution_counter.pyx +++ b/sdks/python/apache_beam/transforms/cy_dataflow_distribution_counter.pyx @@ -14,8 +14,6 @@ # limitations under the License. # -# cython: language_level=3 - """ For internal use only. No backwards compatibility guarantees.""" cimport cython diff --git a/sdks/python/apache_beam/transforms/stats.py b/sdks/python/apache_beam/transforms/stats.py index 2599760f8d5be..d389463e55a26 100644 --- a/sdks/python/apache_beam/transforms/stats.py +++ b/sdks/python/apache_beam/transforms/stats.py @@ -15,8 +15,6 @@ # limitations under the License. # -# cython: language_level=3 - """This module has all statistic related transforms. This ApproximateUnique class will be deprecated [1]. 
PLease look into using diff --git a/sdks/python/apache_beam/utils/counters.py b/sdks/python/apache_beam/utils/counters.py index 214fa433de1c4..57d73fa283ebf 100644 --- a/sdks/python/apache_beam/utils/counters.py +++ b/sdks/python/apache_beam/utils/counters.py @@ -17,7 +17,6 @@ # cython: profile=False # cython: overflowcheck=True -# cython: language_level=3 """Counters collect the progress of the Worker for reporting to the service. diff --git a/sdks/python/apache_beam/utils/windowed_value.py b/sdks/python/apache_beam/utils/windowed_value.py index fb15d3778a6a0..f6232ce2f6b0d 100644 --- a/sdks/python/apache_beam/utils/windowed_value.py +++ b/sdks/python/apache_beam/utils/windowed_value.py @@ -22,8 +22,6 @@ # editing this file as WindowedValues are created for every element for # every step in a Beam pipeline. -# cython: language_level=3 - # pytype: skip-file import collections From 1de0c4670eefc02a35c3ffa7d1e25bf7f69744ce Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 8 Aug 2024 15:11:15 -0700 Subject: [PATCH 44/78] Add no-except to time-critical function. --- sdks/python/apache_beam/runners/worker/statesampler_fast.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx b/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx index d02d05c3af942..45700a0b0f81e 100644 --- a/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx +++ b/sdks/python/apache_beam/runners/worker/statesampler_fast.pyx @@ -57,7 +57,7 @@ cdef extern from "crossplatform_time.h" nogil: long tv_nsec # nanoseconds int clock_gettime(int clock_id, timespec *result) -cdef inline int64_t get_nsec_time() nogil: +cdef inline int64_t get_nsec_time() noexcept nogil: """Get current time as microseconds since Unix epoch.""" cdef timespec current_time # First argument value of 0 corresponds to CLOCK_REALTIME. 
From 6bdf63a9d2deb00d6b33704c8a8416cf6783f4b6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 22:44:50 -0700 Subject: [PATCH 45/78] Bump cloud.google.com/go/spanner from 1.64.0 to 1.66.0 in /sdks (#32126) Bumps [cloud.google.com/go/spanner](https://github.com/googleapis/google-cloud-go) from 1.64.0 to 1.66.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/spanner/v1.64.0...spanner/v1.66.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/spanner dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 18 ++---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index bd25b3beab5aa..1f53410c5e042 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -28,7 +28,7 @@ require ( cloud.google.com/go/datastore v1.17.1 cloud.google.com/go/profiler v0.4.0 cloud.google.com/go/pubsub v1.40.0 - cloud.google.com/go/spanner v1.64.0 + cloud.google.com/go/spanner v1.66.0 cloud.google.com/go/storage v1.43.0 github.com/aws/aws-sdk-go-v2 v1.30.3 github.com/aws/aws-sdk-go-v2/config v1.27.27 diff --git a/sdks/go.sum b/sdks/go.sum index a50c8ce9230c9..8a5072a744094 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -540,8 +540,8 @@ cloud.google.com/go/shell v1.6.0/go.mod h1:oHO8QACS90luWgxP3N9iZVuEiSF84zNyLytb+ cloud.google.com/go/spanner v1.41.0/go.mod h1:MLYDBJR/dY4Wt7ZaMIQ7rXOTLjYrmxLE/5ve9vFfWos= cloud.google.com/go/spanner v1.44.0/go.mod h1:G8XIgYdOK+Fbcpbs7p2fiprDw4CaZX63whnSMLVBxjk= cloud.google.com/go/spanner v1.45.0/go.mod h1:FIws5LowYz8YAE1J8fOS7DJup8ff7xJeetWEo5REA2M= -cloud.google.com/go/spanner v1.64.0 
h1:ltyPbHA/nRAtAhU/o742dXBCI1eNHPeaRY09Ja8B+hM= -cloud.google.com/go/spanner v1.64.0/go.mod h1:TOFx3pb2UwPsDGlE1gTehW+y6YlU4IFk+VdDHSGQS/M= +cloud.google.com/go/spanner v1.66.0 h1:PF1upR8n+DVUO9mUpCc1j5kyHn1Xfq0A53ZrnM0AmeU= +cloud.google.com/go/spanner v1.66.0/go.mod h1:gu+weqqrnoBsVlxOmMG5pzDZ2nkpqqJx4MsnmIacH5w= cloud.google.com/go/speech v1.6.0/go.mod h1:79tcr4FHCimOp56lwC01xnt/WPJZc4v3gzyT7FoBkCM= cloud.google.com/go/speech v1.7.0/go.mod h1:KptqL+BAQIhMsj1kOP2la5DSEEerPDuOP/2mmkhHhZQ= cloud.google.com/go/speech v1.8.0/go.mod h1:9bYIl1/tjsAnMgKGHKmBZzXKEkGgtU+MpdDPTE9f7y0= @@ -681,13 +681,9 @@ github.com/aws/aws-sdk-go v1.34.0/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU github.com/aws/aws-sdk-go-v2 v1.7.1/go.mod h1:L5LuPC1ZgDr2xQS7AmIec/Jlc7O/Y1u2KxJyNVab250= github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP8dY= github.com/aws/aws-sdk-go-v2 v1.30.3/go.mod h1:nIQjQVp5sfpQcTc9mPSr1B0PaWK5ByX9MOoDadSN4lc= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1 h1:ZY3108YtBNq96jNZTICHxN1gSBSbnvIdYwwqnvCV4Mc= -github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.5.1/go.mod h1:t8PYl/6LzdAqsU4/9tz28V/kU+asFePvpOMkdul0gEQ= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3 h1:tW1/Rkad38LA15X4UQtjXZXNKsCgkshC3EbmcUmghTg= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.3/go.mod h1:UbnqO+zjqk3uIt9yCACHJ9IVNhyhOCnYk8yA19SAWrM= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.27.4 h1:AhfWb5ZwimdsYTgP7Od8E9L1u4sKmDW2ZVeLcf2O42M= -github.com/aws/aws-sdk-go-v2/config v1.27.4/go.mod h1:zq2FFXK3A416kiukwpsd+rD4ny6JC7QSkp4QdN1Mp2g= github.com/aws/aws-sdk-go-v2/config v1.27.27 h1:HdqgGt1OAP0HkEDDShEl0oSYa9ZZBSOmKpdpsDMdO90= github.com/aws/aws-sdk-go-v2/config v1.27.27/go.mod h1:MVYamCg76dFNINkZFu4n4RjDixhVr51HLj4ErWzrVwg= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= @@ 
-697,8 +693,6 @@ github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDu github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 h1:KreluoV8FZDEtI6Co2xuNk/UqI9iwMrOx/87PBNIKqw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvHE0Tjvn7kbxaUhl75CJi1sbfhMxkU= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8 h1:wuOjvalpd2CnXffks74Vq6n3yv9vunKCoy4R1sjStGk= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.13.8/go.mod h1:vywwjy6VnrR48Izg136JoSUXC4mH9QeUi3g0EH9DSrA= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 h1:zeN9UtUlA6FTx0vFSayxSX32HDw73Yb6Hh2izDSFxXY= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10/go.mod h1:3HKuexPDcwLWPaqpW2UR/9n8N/u/3CKcGAzSs8p8u8g= github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= @@ -708,28 +702,20 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15/go.mod h1:ZQLZqhcu+Jh github.com/aws/aws-sdk-go-v2/internal/ini v1.1.1/go.mod h1:Zy8smImhTdOETZqfyn01iNOe0CNggVbPjCajyaz6Gvg= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3 h1:lMwCXiWJlrtZot0NJTjbC8G9zl+V3i68gBTBBvDeEXA= -github.com/aws/aws-sdk-go-v2/internal/v4a v1.2.3/go.mod h1:5yzAuE9i2RkVAttBl8yxZgQr5OCq4D5yDnG7j9x2L0U= github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15 h1:Z5r7SycxmSllHYmaAZPpmN8GviDrSGhMS6bldqtXZPw= github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.15/go.mod h1:CetW7bDE00QoGEmPUoZuRog07SGVAUVW6LFpNP0YfIg= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.2.1/go.mod h1:v33JQ57i2nekYTA70Mb+O18KeH4KqhdqxTJZNK1zdRE= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 
h1:dT3MqvGhSoaIhRseqw2I0yH81l7wiR2vjs57O51EAm8= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3/go.mod h1:GlAeCkHwugxdHaueRr4nhPuY+WW+gR8UjlcqzPr1SPI= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3 h1:xbwRyCy7kXrOj89iIKLB6NfE2WCpP9HoKyk8dMDvnIQ= -github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.2.3/go.mod h1:R+/S1O4TYpcktbVwddeOYg+uwUfLhADP2S/x4QwsCTM= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17 h1:YPYe6ZmvUfDDDELqEKtAd6bo8zxhkm+XEFEzQisqUIE= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.3.17/go.mod h1:oBtcnYua/CgzCWYN7NZ5j7PotFDaFSUjCYVTtfyn7vw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.2.1/go.mod h1:zceowr5Z1Nh2WVP8bf/3ikB41IZW59E4yIYbg+pC6mw= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrxZlQ044RiM+WdoZxp0p+EGM62y3L6pwA4olE= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNNvHpq/2/QSJnp4+ECvqIy55w95Ofs0ze+nGQ= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3 h1:KV0z2RDc7euMtg8aUT1czv5p29zcLlXALNFsd3jkkEc= -github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.16.3/go.mod h1:KZgs2ny8HsxRIRbDwgvJcHHBZPOzQr/+NtGwnP+w2ec= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 h1:246A4lSTXWJw/rmlQI+TT2OcqeDMKBdyjEQrafMaQdA= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15/go.mod h1:haVfg3761/WF7YPuJOER2MP0k4UAXyHaLclKXB6usDg= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2 h1:NnduxUd9+Fq9DcCDdJK8v6l9lR1xDX4usvog+JuQAno= -github.com/aws/aws-sdk-go-v2/service/s3 v1.42.2/go.mod h1:NXRKkiRF+erX2hnybnVU660cYT5/KChRD4iUgJ97cI8= github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 h1:hT8ZAZRIfqBqHbzKTII+CIiY8G2oC9OpLedkZ51DWl8= 
github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3/go.mod h1:Lcxzg5rojyVPU/0eFwLtcyTaek/6Mtic5B1gJo7e/zE= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= From af6bf8a1423eef082a289e9a1cfcfe0bf2d8c0ed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 23:11:22 -0700 Subject: [PATCH 46/78] Bump golang.org/x/net from 0.27.0 to 0.28.0 in /sdks (#32128) Bumps [golang.org/x/net](https://github.com/golang/net) from 0.27.0 to 0.28.0. - [Commits](https://github.com/golang/net/compare/v0.27.0...v0.28.0) --- updated-dependencies: - dependency-name: golang.org/x/net dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 6 +++--- sdks/go.sum | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 1f53410c5e042..8849f9732afd3 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -53,10 +53,10 @@ require ( github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c go.mongodb.org/mongo-driver v1.16.1 - golang.org/x/net v0.27.0 + golang.org/x/net v0.28.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.8.0 - golang.org/x/sys v0.22.0 + golang.org/x/sys v0.23.0 golang.org/x/text v0.17.0 google.golang.org/api v0.189.0 google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f @@ -183,7 +183,7 @@ require ( github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect github.com/zeebo/xxh3 v1.0.2 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.25.0 // indirect + golang.org/x/crypto v0.26.0 // indirect golang.org/x/mod v0.18.0 // indirect golang.org/x/tools v0.22.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect diff --git 
a/sdks/go.sum b/sdks/go.sum index 8a5072a744094..90fb958b1df68 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1255,8 +1255,8 @@ golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5tA2iutPOUgjI07+tq3H3K9MVA1s8= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= -golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1377,8 +1377,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= -golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 
v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1515,8 +1515,8 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1525,8 +1525,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= -golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From 27741fb6c9b6fc109a94bb91fa4a665473f50268 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 05:11:41 -0700 Subject: [PATCH 47/78] Bump cloud.google.com/go/profiler from 0.4.0 to 0.4.1 in /sdks (#32125) Bumps [cloud.google.com/go/profiler](https://github.com/googleapis/google-cloud-go) from 0.4.0 to 0.4.1. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/v0.4.0...ai/v0.4.1) --- updated-dependencies: - dependency-name: cloud.google.com/go/profiler dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 4 ++-- sdks/go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 8849f9732afd3..8c7a52ca951c7 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -26,7 +26,7 @@ require ( cloud.google.com/go/bigquery v1.62.0 cloud.google.com/go/bigtable v1.28.0 cloud.google.com/go/datastore v1.17.1 - cloud.google.com/go/profiler v0.4.0 + cloud.google.com/go/profiler v0.4.1 cloud.google.com/go/pubsub v1.40.0 cloud.google.com/go/spanner v1.66.0 cloud.google.com/go/storage v1.43.0 @@ -152,7 +152,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v23.5.26+incompatible // indirect - github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 // indirect + github.com/google/pprof v0.0.0-20240528025155-186aa0362fba // indirect github.com/google/renameio/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.7 // indirect 
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 90fb958b1df68..e0039f28e43ba 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -439,8 +439,8 @@ cloud.google.com/go/privatecatalog v0.5.0/go.mod h1:XgosMUvvPyxDjAVNDYxJ7wBW8//h cloud.google.com/go/privatecatalog v0.6.0/go.mod h1:i/fbkZR0hLN29eEWiiwue8Pb+GforiEIBnV9yrRUOKI= cloud.google.com/go/privatecatalog v0.7.0/go.mod h1:2s5ssIFO69F5csTXcwBP7NPFTZvps26xGzvQ2PQaBYg= cloud.google.com/go/privatecatalog v0.8.0/go.mod h1:nQ6pfaegeDAq/Q5lrfCQzQLhubPiZhSaNhIgfJlnIXs= -cloud.google.com/go/profiler v0.4.0 h1:ZeRDZbsOBDyRG0OiK0Op1/XWZ3xeLwJc9zjkzczUxyY= -cloud.google.com/go/profiler v0.4.0/go.mod h1:RvPlm4dilIr3oJtAOeFQU9Lrt5RoySHSDj4pTd6TWeU= +cloud.google.com/go/profiler v0.4.1 h1:Q7+lOvikTGMJ/IAWocpYYGit4SIIoILmVZfEEWTORSY= +cloud.google.com/go/profiler v0.4.1/go.mod h1:LBrtEX6nbvhv1w/e5CPZmX9ajGG9BGLtGbv56Tg4SHs= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= @@ -938,8 +938,8 @@ github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210601050228-01bbb1931b22/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20230602150820-91b7bce49751 h1:hR7/MlvK23p6+lIw9SN1TigNLn9ZnF3W4SYRKq2gAHs= -github.com/google/pprof v0.0.0-20230602150820-91b7bce49751/go.mod h1:Jh3hGz2jkYak8qXPD19ryItVnUgpgeqzdkY/D0EaeuA= +github.com/google/pprof v0.0.0-20240528025155-186aa0362fba h1:ql1qNgCyOB7iAEk8JTNM+zJrgIbnyCKX/wdlyPufP5g= +github.com/google/pprof 
v0.0.0-20240528025155-186aa0362fba/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg= github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4= From 6d96ae2580d61498cc6ee04c25777ebe5e79de32 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Fri, 9 Aug 2024 09:27:09 -0400 Subject: [PATCH 48/78] Fix Lineage name breaking change (#32122) --- .../src/main/java/org/apache/beam/sdk/metrics/Lineage.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java index 8b69b0ef55236..6166a562bf2df 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/Lineage.java @@ -55,8 +55,8 @@ public static Set query(MetricResults results, Type type) { /** Lineage metrics resource types. 
*/ public enum Type { - SOURCE("source"), - SINK("sink"); + SOURCE("sources"), + SINK("sinks"); private final String name; From 01100a3b2fe9af9f30192f31cefe4e8d180dc782 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 9 Aug 2024 16:01:40 +0200 Subject: [PATCH 49/78] Generate python dependencies (#32132) --- .../license_scripts/dep_urls_py.yaml | 2 + .../py310/base_image_requirements.txt | 135 ++++++++--------- .../py311/base_image_requirements.txt | 133 ++++++++--------- .../py312/base_image_requirements.txt | 122 +++++++-------- .../py38/base_image_requirements.txt | 133 ++++++++--------- .../py39/base_image_requirements.txt | 139 +++++++++--------- 6 files changed, 336 insertions(+), 328 deletions(-) diff --git a/sdks/python/container/license_scripts/dep_urls_py.yaml b/sdks/python/container/license_scripts/dep_urls_py.yaml index 6fc5129e35c23..0fe830b7ab6ea 100644 --- a/sdks/python/container/license_scripts/dep_urls_py.yaml +++ b/sdks/python/container/license_scripts/dep_urls_py.yaml @@ -141,6 +141,8 @@ pip_dependencies: license: "https://raw.githubusercontent.com/jamescasbon/PyVCF/master/LICENSE" singledispatch: license: "file:///tmp/license_scripts/manual_licenses/singledispatch/LICENSE" + scikit-learn: + license: "https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/COPYING" scipy: license: "https://raw.githubusercontent.com/scipy/scipy/master/LICENSE.txt" soupsieve: diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index e2806270e5443..2f736d0ebd26e 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -21,76 +21,77 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==0.29.37 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 
-google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -98,60 +99,60 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 
testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.2 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 1cba006ad32cc..712986882a06e 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -21,74 +21,75 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -annotated-types==0.6.0 -attrs==23.2.0 +annotated-types==0.7.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==0.29.37 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 
+google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -96,59 +97,59 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 
pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 -tqdm==4.66.2 -typing_extensions==4.11.0 +threadpoolctl==3.5.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index 4a6147b573b85..241d82913f2ec 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -21,72 +21,74 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-annotated-types==0.6.0 -attrs==23.2.0 +annotated-types==0.7.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.7 +cryptography==43.0.0 Cython==0.29.37 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.5.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.19.0 -google-api-python-client==2.128.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.50.0 -google-cloud-bigquery==3.22.0 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 google-cloud-bigquery-storage==2.25.0 -google-cloud-bigtable==2.23.1 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.46.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 
-grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.63.0 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.5 +hypothesis==6.110.1 idna==3.7 iniconfig==2.0.0 +Jinja2==3.0.3 joblib==1.4.2 -jsonpickle==3.0.4 -jsonschema==4.22.0 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -94,59 +96,59 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.3 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 pluggy==1.5.0 -proto-plus==1.23.0 -protobuf==4.25.3 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.1 -pydantic_core==2.18.2 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 -pymongo==4.7.1 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.4 +PyYAML==6.0.2 +redis==5.0.8 referencing==0.35.1 -regex==2024.4.28 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.1 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -setuptools==69.5.1 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.14.0 +setuptools==72.1.0 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.30 -sqlparse==0.5.0 -tenacity==8.3.0 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 threadpoolctl==3.5.0 -tqdm==4.66.4 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 uritemplate==4.1.1 -urllib3==2.2.1 -wheel==0.43.0 +urllib3==2.2.2 +wheel==0.44.0 wrapt==1.16.0 
-zstandard==0.22.0 +zstandard==0.23.0 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index f88dba103469b..0c605548f9548 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -21,79 +21,80 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 backports.zoneinfo==0.2.1 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 +cachetools==5.4.0 certifi==2024.7.4 -cffi==1.16.0 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==0.29.37 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 
-google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 -importlib_metadata==7.1.0 +importlib_metadata==8.2.0 importlib_resources==6.4.0 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -101,62 +102,62 @@ nose==1.3.7 numpy==1.24.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.0.3 parameterized==0.9.0 pkgutil_resolve_name==1.3.10 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 
-python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 scikit-learn==1.3.2 scipy==1.10.1 -shapely==2.0.4 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 +SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.3 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zipp==3.18.1 -zstandard==0.22.0 +zipp==3.19.2 +zstandard==0.23.0 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 39b888dd6ab75..52a7136ecffa7 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -21,77 +21,78 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-annotated-types==0.6.0 +annotated-types==0.7.0 async-timeout==4.0.3 -attrs==23.2.0 +attrs==24.2.0 beautifulsoup4==4.12.3 bs4==0.0.2 build==1.2.1 -cachetools==5.3.3 -certifi==2024.2.2 -cffi==1.16.0 +cachetools==5.4.0 +certifi==2024.7.4 +cffi==1.17.0 charset-normalizer==3.3.2 click==8.1.7 cloudpickle==2.2.1 cramjam==2.8.3 crcmod==1.7 -cryptography==42.0.5 +cryptography==43.0.0 Cython==0.29.37 -Deprecated==1.2.14 deprecation==2.1.0 dill==0.3.1.1 dnspython==2.6.1 -docker==7.0.0 +docker==7.1.0 docopt==0.6.2 docstring_parser==0.16 -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 execnet==2.1.1 -fastavro==1.9.4 +fastavro==1.9.5 fasteners==0.19 -freezegun==1.4.0 +freezegun==1.5.1 future==1.0.0 -google-api-core==2.18.0 -google-api-python-client==2.126.0 +google-api-core==2.19.1 +google-api-python-client==2.140.0 google-apitools==0.5.31 -google-auth==2.29.0 +google-auth==2.33.0 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.47.0 -google-cloud-bigquery==3.20.1 -google-cloud-bigquery-storage==2.24.0 -google-cloud-bigtable==2.23.1 +google-cloud-aiplatform==1.61.0 +google-cloud-bigquery==3.25.0 +google-cloud-bigquery-storage==2.25.0 +google-cloud-bigtable==2.25.0 google-cloud-core==2.4.1 -google-cloud-datastore==2.19.0 -google-cloud-dlp==3.16.0 -google-cloud-language==2.13.3 +google-cloud-datastore==2.20.0 +google-cloud-dlp==3.21.0 +google-cloud-language==2.14.0 google-cloud-profiler==4.1.0 -google-cloud-pubsub==2.21.1 -google-cloud-pubsublite==1.10.0 -google-cloud-recommendations-ai==0.10.10 -google-cloud-resource-manager==1.12.3 -google-cloud-spanner==3.44.0 -google-cloud-storage==2.16.0 -google-cloud-videointelligence==2.13.3 -google-cloud-vision==3.7.2 +google-cloud-pubsub==2.23.0 +google-cloud-pubsublite==1.11.1 +google-cloud-recommendations-ai==0.10.12 +google-cloud-resource-manager==1.12.5 +google-cloud-spanner==3.48.0 +google-cloud-storage==2.18.2 +google-cloud-videointelligence==2.13.5 +google-cloud-vision==3.7.4 google-crc32c==1.5.0 
-google-resumable-media==2.7.0 -googleapis-common-protos==1.63.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.63.2 greenlet==3.0.3 -grpc-google-iam-v1==0.13.0 +grpc-google-iam-v1==0.13.1 grpc-interceptor==0.15.4 -grpcio==1.62.2 -grpcio-status==1.62.2 +grpcio==1.65.4 +grpcio-status==1.62.3 guppy3==3.1.4.post1 hdfs==2.7.3 httplib2==0.22.0 -hypothesis==6.100.1 +hypothesis==6.110.1 idna==3.7 -importlib_metadata==7.1.0 +importlib_metadata==8.2.0 iniconfig==2.0.0 -joblib==1.4.0 +Jinja2==3.0.3 +joblib==1.4.2 Js2Py==0.74 -jsonpickle==3.0.4 -jsonschema==4.21.1 +jsonpickle==3.2.2 +jsonschema==4.23.0 jsonschema-specifications==2023.12.1 +MarkupSafe==2.1.5 mmh3==4.1.0 mock==5.1.0 nltk==3.8.1 @@ -99,61 +100,61 @@ nose==1.3.7 numpy==1.26.4 oauth2client==4.1.3 objsize==0.7.0 -orjson==3.10.1 +orjson==3.10.7 overrides==7.7.0 -packaging==24.0 +packaging==24.1 pandas==2.1.4 parameterized==0.9.0 -pluggy==1.4.0 -proto-plus==1.23.0 -protobuf==4.25.3 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==4.25.4 psycopg2-binary==2.9.9 -pyarrow==14.0.2 +pyarrow==16.1.0 pyarrow-hotfix==0.6 pyasn1==0.6.0 pyasn1_modules==0.4.0 pycparser==2.22 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.8.2 +pydantic_core==2.20.1 pydot==1.4.2 PyHamcrest==2.1.0 pyjsparser==2.7.1 -pymongo==4.6.3 -PyMySQL==1.1.0 +pymongo==4.8.0 +PyMySQL==1.1.1 pyparsing==3.1.2 -pyproject_hooks==1.0.0 +pyproject_hooks==1.1.0 pytest==7.4.4 pytest-timeout==2.3.1 -pytest-xdist==3.5.0 +pytest-xdist==3.6.1 python-dateutil==2.9.0.post0 -python-snappy==0.7.1 +python-snappy==0.7.2 pytz==2024.1 -PyYAML==6.0.1 -redis==5.0.3 -referencing==0.34.0 -regex==2024.4.16 +PyYAML==6.0.2 +redis==5.0.8 +referencing==0.35.1 +regex==2024.7.24 requests==2.31.0 requests-mock==1.12.1 -rpds-py==0.18.0 +rpds-py==0.20.0 rsa==4.9 -scikit-learn==1.4.2 -scipy==1.13.0 -shapely==2.0.4 +scikit-learn==1.5.1 +scipy==1.13.1 +shapely==2.0.5 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.5 -SQLAlchemy==2.0.29 -sqlparse==0.5.0 -tenacity==8.2.3 
+SQLAlchemy==2.0.32 +sqlparse==0.5.1 +tenacity==8.5.0 testcontainers==3.7.1 -threadpoolctl==3.4.0 +threadpoolctl==3.5.0 tomli==2.0.1 -tqdm==4.66.2 -typing_extensions==4.11.0 +tqdm==4.66.5 +typing_extensions==4.12.2 tzdata==2024.1 tzlocal==5.2 uritemplate==4.1.1 -urllib3==2.2.1 +urllib3==2.2.2 wrapt==1.16.0 -zipp==3.18.1 -zstandard==0.22.0 +zipp==3.19.2 +zstandard==0.23.0 From 82c3b36af70f6d4fa90d69963da03284c0dd6d28 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:53:17 -0400 Subject: [PATCH 50/78] Bump golang.org/x/oauth2 from 0.21.0 to 0.22.0 in /sdks (#32129) Bumps [golang.org/x/oauth2](https://github.com/golang/oauth2) from 0.21.0 to 0.22.0. - [Commits](https://github.com/golang/oauth2/compare/v0.21.0...v0.22.0) --- updated-dependencies: - dependency-name: golang.org/x/oauth2 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 8c7a52ca951c7..9e32b2ba7b356 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -54,7 +54,7 @@ require ( github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c go.mongodb.org/mongo-driver v1.16.1 golang.org/x/net v0.28.0 - golang.org/x/oauth2 v0.21.0 + golang.org/x/oauth2 v0.22.0 golang.org/x/sync v0.8.0 golang.org/x/sys v0.23.0 golang.org/x/text v0.17.0 diff --git a/sdks/go.sum b/sdks/go.sum index e0039f28e43ba..af1d7b4ba828f 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1408,8 +1408,8 @@ golang.org/x/oauth2 v0.4.0/go.mod h1:RznEsdpjGAINPTOF0UH/t+xJ75L18YO3Ho6Pyn+uRec golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= golang.org/x/oauth2 v0.7.0/go.mod 
h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= +golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= From 4b476832020c593bf79d6c06b370efb0cd3b03c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 10:54:06 -0400 Subject: [PATCH 51/78] Bump github.com/fsouza/fake-gcs-server from 1.47.7 to 1.49.2 in /sdks (#32124) Bumps [github.com/fsouza/fake-gcs-server](https://github.com/fsouza/fake-gcs-server) from 1.47.7 to 1.49.2. - [Release notes](https://github.com/fsouza/fake-gcs-server/releases) - [Commits](https://github.com/fsouza/fake-gcs-server/compare/v1.47.7...v1.49.2) --- updated-dependencies: - dependency-name: github.com/fsouza/fake-gcs-server dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 9e32b2ba7b356..624cc0ab1ce82 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -68,7 +68,7 @@ require ( require ( github.com/avast/retry-go/v4 v4.6.0 - github.com/fsouza/fake-gcs-server v1.47.7 + github.com/fsouza/fake-gcs-server v1.49.2 golang.org/x/exp v0.0.0-20231006140011-7918f672742d ) diff --git a/sdks/go.sum b/sdks/go.sum index af1d7b4ba828f..67686da8e408a 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -813,8 +813,8 @@ github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= -github.com/fsouza/fake-gcs-server v1.47.7 h1:56/U4rKY081TaNbq0gHWi7/71UxC2KROqcnrD9BRJhs= -github.com/fsouza/fake-gcs-server v1.47.7/go.mod h1:4vPUynN8/zZlxk5Jpy6LvvTTxItdTAObK4DYnp89Jys= +github.com/fsouza/fake-gcs-server v1.49.2 h1:fukDqzEQM50QkA0jAbl6cLqeDu3maQjwZBuys759TR4= +github.com/fsouza/fake-gcs-server v1.49.2/go.mod h1:17SYzJEXRcaAA5ATwwvgBkSIqIy7r1icnGM0y/y4foY= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g= github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks= @@ -1008,8 +1008,6 @@ github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHW github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= github.com/johannesboyne/gofakes3 
v0.0.0-20221110173912-32fb85c5aed6 h1:eQGUsj2LcsLzfrHY1noKDSU7h+c9/rw9pQPwbQ9g1jQ= github.com/johannesboyne/gofakes3 v0.0.0-20221110173912-32fb85c5aed6/go.mod h1:LIAXxPvcUXwOcTIj9LSNSUpE9/eMHalTWxsP/kmWxQI= -github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= -github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= @@ -1057,10 +1055,8 @@ github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= -github.com/minio/minio-go/v7 v7.0.66 h1:bnTOXOHjOqv/gcMuiVbN9o2ngRItvqE774dG9nq0Dzw= -github.com/minio/minio-go/v7 v7.0.66/go.mod h1:DHAgmyQEGdW3Cif0UooKOyrT3Vxs82zNdV6tkKhRtbs= -github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= -github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= +github.com/minio/minio-go/v7 v7.0.71 h1:No9XfOKTYi6i0GnBj+WZwD8WP5GZfL7n7GOjRqCdAjA= +github.com/minio/minio-go/v7 v7.0.71/go.mod h1:4yBA8v80xGA30cfM3fz0DKYMXunWl/AV/6tWEs9ryzo= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= github.com/moby/patternmatcher v0.6.0 h1:GmP9lR19aU5GqSSFko+5pRqHi+Ohk1O69aFiKkVGiPk= @@ -1071,10 +1067,6 @@ github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg= github.com/moby/sys/user 
v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU= github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= -github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= -github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= -github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= From cae9148ee1ddbd0c46769e9ecfd963c47d66fcca Mon Sep 17 00:00:00 2001 From: Shunping Huang Date: Fri, 9 Aug 2024 10:57:31 -0400 Subject: [PATCH 52/78] Bump up google-cloud-storage version to fix data corruption issue (#32135) --- sdks/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 756c952b0101b..110ff6a89882f 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -440,7 +440,7 @@ def get_portability_package_data(): 'google-cloud-datastore>=2.0.0,<3', 'google-cloud-pubsub>=2.1.0,<3', 'google-cloud-pubsublite>=1.2.0,<2', - 'google-cloud-storage>=2.16.0,<3', + 'google-cloud-storage>=2.18.2,<3', # GCP packages required by tests 'google-cloud-bigquery>=2.0.0,<4', 'google-cloud-bigquery-storage>=2.6.3,<3', From 1c0cfa1ccbae2ff8bdca562a9d52757197cd4ddf Mon Sep 17 00:00:00 2001 From: Bartosz Zablocki Date: Fri, 9 Aug 2024 19:57:48 +0200 Subject: [PATCH 53/78] Expose watermarkIdleDurationThreshold parameter to the user in SolaceIO (#32109) --- .../apache/beam/sdk/io/solace/SolaceIO.java | 28 
+++++++++++++++++-- .../io/solace/read/UnboundedSolaceReader.java | 4 ++- .../io/solace/read/UnboundedSolaceSource.java | 9 ++++++ .../io/solace/read/WatermarkParameters.java | 26 +---------------- .../sdk/io/solace/read/WatermarkPolicy.java | 10 +++++-- .../beam/sdk/io/solace/SolaceIOTest.java | 3 +- 6 files changed, 48 insertions(+), 32 deletions(-) diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java index bb9f0c6ea689b..dcfdcc4fabb9c 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/SolaceIO.java @@ -51,6 +51,7 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -393,7 +394,8 @@ public class SolaceIO { } }; private static final boolean DEFAULT_DEDUPLICATE_RECORDS = false; - + private static final Duration DEFAULT_WATERMARK_IDLE_DURATION_THRESHOLD = + Duration.standardSeconds(30); public static final int DEFAULT_WRITER_MAX_NUMBER_OF_WORKERS = 20; public static final int DEFAULT_WRITER_CLIENTS_PER_WORKER = 4; public static final Boolean DEFAULT_WRITER_PUBLISH_LATENCY_METRICS = false; @@ -440,7 +442,8 @@ public static Read read() { .setTypeDescriptor(TypeDescriptor.of(Solace.Record.class)) .setParseFn(SolaceRecordMapper::map) .setTimestampFn(SENDER_TIMESTAMP_FUNCTION) - .setDeduplicateRecords(DEFAULT_DEDUPLICATE_RECORDS)); + .setDeduplicateRecords(DEFAULT_DEDUPLICATE_RECORDS) + .setWatermarkIdleDurationThreshold(DEFAULT_WATERMARK_IDLE_DURATION_THRESHOLD)); } /** * Create a {@link Read} transform, to read from Solace. 
Specify a {@link SerializableFunction} to @@ -467,7 +470,8 @@ public static Read read( .setTypeDescriptor(typeDescriptor) .setParseFn(parseFn) .setTimestampFn(timestampFn) - .setDeduplicateRecords(DEFAULT_DEDUPLICATE_RECORDS)); + .setDeduplicateRecords(DEFAULT_DEDUPLICATE_RECORDS) + .setWatermarkIdleDurationThreshold(DEFAULT_WATERMARK_IDLE_DURATION_THRESHOLD)); } /** @@ -540,6 +544,19 @@ public Read withMaxNumConnections(Integer maxNumConnections) { return this; } + /** + * Optional. Denotes the duration for which the watermark can be idle. If there are no incoming + * messages for this ‘idle’ period of time, the watermark is set to a timestamp representing a + * time earlier than now by the ‘idle’ period of time (e.g. if the ‘idle’ period of time is set + * to 30 seconds, and there is no new data incoming for 30 seconds, the watermark will be set to + * max(currentWatermark, now() - 30 seconds). The default watermark idle duration threshold is + * {@link #DEFAULT_WATERMARK_IDLE_DURATION_THRESHOLD}. + */ + public Read withWatermarkIdleDurationThreshold(Duration idleDurationThreshold) { + configurationBuilder.setWatermarkIdleDurationThreshold(idleDurationThreshold); + return this; + } + /** * Optional, default: false. Set to deduplicate messages based on the {@link * BytesXMLMessage#getApplicationMessageId()} of the incoming {@link BytesXMLMessage}. 
If the @@ -652,6 +669,8 @@ abstract static class Configuration { abstract TypeDescriptor getTypeDescriptor(); + abstract Duration getWatermarkIdleDurationThreshold(); + public static Builder builder() { Builder builder = new org.apache.beam.sdk.io.solace.AutoValue_SolaceIO_Read_Configuration.Builder(); @@ -680,6 +699,8 @@ abstract Builder setParseFn( abstract Builder setTypeDescriptor(TypeDescriptor typeDescriptor); + abstract Builder setWatermarkIdleDurationThreshold(Duration idleDurationThreshold); + abstract Configuration build(); } } @@ -716,6 +737,7 @@ public PCollection expand(PBegin input) { configuration.getDeduplicateRecords(), coder, configuration.getTimestampFn(), + configuration.getWatermarkIdleDurationThreshold(), configuration.getParseFn()))); } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceReader.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceReader.java index 0155345a23236..c18a9d110b2ad 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceReader.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceReader.java @@ -62,7 +62,9 @@ class UnboundedSolaceReader extends UnboundedReader { public UnboundedSolaceReader(UnboundedSolaceSource currentSource) { this.currentSource = currentSource; - this.watermarkPolicy = WatermarkPolicy.create(currentSource.getTimestampFn()); + this.watermarkPolicy = + WatermarkPolicy.create( + currentSource.getTimestampFn(), currentSource.getWatermarkIdleDurationThreshold()); this.sessionService = currentSource.getSessionServiceFactory().create(); this.sempClient = currentSource.getSempClientFactory().create(); } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceSource.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceSource.java index 370159994941b..1cb17a49fbdba 
100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceSource.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/UnboundedSolaceSource.java @@ -31,6 +31,7 @@ import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.SerializableFunction; import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,6 +47,7 @@ public class UnboundedSolaceSource extends UnboundedSource timestampFn; + private final Duration watermarkIdleDurationThreshold; private final SerializableFunction<@Nullable BytesXMLMessage, @Nullable T> parseFn; public Queue getQueue() { @@ -64,6 +66,10 @@ public SerializableFunction getTimestampFn() { return timestampFn; } + public Duration getWatermarkIdleDurationThreshold() { + return watermarkIdleDurationThreshold; + } + public SerializableFunction<@Nullable BytesXMLMessage, @Nullable T> getParseFn() { return parseFn; } @@ -76,6 +82,7 @@ public UnboundedSolaceSource( boolean enableDeduplication, Coder coder, SerializableFunction timestampFn, + Duration watermarkIdleDurationThreshold, SerializableFunction<@Nullable BytesXMLMessage, @Nullable T> parseFn) { this.queue = queue; this.sempClientFactory = sempClientFactory; @@ -84,6 +91,7 @@ public UnboundedSolaceSource( this.enableDeduplication = enableDeduplication; this.coder = coder; this.timestampFn = timestampFn; + this.watermarkIdleDurationThreshold = watermarkIdleDurationThreshold; this.parseFn = parseFn; } @@ -125,6 +133,7 @@ private List> getSolaceSources( enableDeduplication, coder, timestampFn, + watermarkIdleDurationThreshold, parseFn); sourceList.add(source); } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkParameters.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkParameters.java index 
f58cb1cc202d0..29b35d883f22f 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkParameters.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkParameters.java @@ -21,7 +21,6 @@ import java.io.Serializable; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.joda.time.Duration; import org.joda.time.Instant; @@ -29,9 +28,6 @@ @AutoValue abstract class WatermarkParameters implements Serializable { - private static final Duration STANDARD_WATERMARK_IDLE_DURATION_THRESHOLD = - Duration.standardSeconds(30); - abstract Instant getCurrentWatermark(); abstract Instant getLastSavedWatermark(); @@ -48,8 +44,7 @@ static Builder builder() { return new AutoValue_WatermarkParameters.Builder() .setCurrentWatermark(BoundedWindow.TIMESTAMP_MIN_VALUE) .setLastSavedWatermark(BoundedWindow.TIMESTAMP_MIN_VALUE) - .setLastUpdateTime(Instant.now()) - .setWatermarkIdleDurationThreshold(STANDARD_WATERMARK_IDLE_DURATION_THRESHOLD); + .setLastUpdateTime(Instant.now()); } @AutoValue.Builder @@ -66,23 +61,4 @@ abstract static class Builder { abstract WatermarkParameters build(); } - - /** - * Create an instance of {@link WatermarkParameters} with a {@code SerializableFunction} to - * extract the event time. - */ - static WatermarkParameters create(SerializableFunction timestampFn) { - Preconditions.checkArgument(timestampFn != null, "timestampFn function is null"); - return WatermarkParameters.builder().setTimestampFn(timestampFn).build(); - } - - /** - * Specify the watermark idle duration to consider before advancing the watermark. The default - * watermark idle duration threshold is {@link #STANDARD_WATERMARK_IDLE_DURATION_THRESHOLD}. 
- */ - WatermarkParameters withWatermarkIdleDurationThreshold(Duration idleDurationThreshold) { - Preconditions.checkArgument( - idleDurationThreshold != null, "watermark idle duration threshold is null"); - return toBuilder().setWatermarkIdleDurationThreshold(idleDurationThreshold).build(); - } } diff --git a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkPolicy.java b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkPolicy.java index 13d65639e3358..9d2ed24f3c06a 100644 --- a/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkPolicy.java +++ b/sdks/java/io/solace/src/main/java/org/apache/beam/sdk/io/solace/read/WatermarkPolicy.java @@ -21,6 +21,7 @@ import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Ordering; import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; import org.joda.time.Instant; /** @@ -39,8 +40,13 @@ class WatermarkPolicy implements Serializable { private WatermarkParameters watermarkParameters; - static WatermarkPolicy create(SerializableFunction timestampFunction) { - return new WatermarkPolicy(WatermarkParameters.create(timestampFunction)); + static WatermarkPolicy create( + SerializableFunction timestampFunction, Duration watermarkIdleDurationThreshold) { + return new WatermarkPolicy( + WatermarkParameters.builder() + .setTimestampFn(timestampFunction) + .setWatermarkIdleDurationThreshold(watermarkIdleDurationThreshold) + .build()); } private WatermarkPolicy(WatermarkParameters watermarkParameters) { diff --git a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/SolaceIOTest.java b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/SolaceIOTest.java index bd9d5d401b548..cc1fa1d667aaf 100644 --- a/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/SolaceIOTest.java +++ 
b/sdks/java/io/solace/src/test/java/org/apache/beam/sdk/io/solace/SolaceIOTest.java @@ -95,6 +95,7 @@ private static UnboundedSolaceSource getSource(Read spec, TestPi configuration.getDeduplicateRecords(), spec.inferCoder(pipeline, configuration.getTypeDescriptor()), configuration.getTimestampFn(), + configuration.getWatermarkIdleDurationThreshold(), configuration.getParseFn()); } @@ -527,7 +528,7 @@ public void testCheckpointMarkSafety() throws Exception { @Test public void testDefaultCoder() { Coder coder = - new UnboundedSolaceSource<>(null, null, null, 0, false, null, null, null) + new UnboundedSolaceSource<>(null, null, null, 0, false, null, null, null, null) .getCheckpointMarkCoder(); CoderProperties.coderSerializable(coder); } From f73a6d1570acc2945db4d80206dd86d6054f2ac2 Mon Sep 17 00:00:00 2001 From: Timothy Itodo Date: Fri, 9 Aug 2024 14:06:02 -0500 Subject: [PATCH 54/78] Create Beam YAML Join documentation (#31494) --- .../en/documentation/sdks/yaml-join.md | 182 ++++++++++++++++++ .../partials/section-menu/en/sdks.html | 1 + 2 files changed, 183 insertions(+) create mode 100644 website/www/site/content/en/documentation/sdks/yaml-join.md diff --git a/website/www/site/content/en/documentation/sdks/yaml-join.md b/website/www/site/content/en/documentation/sdks/yaml-join.md new file mode 100644 index 0000000000000..d207926ff995b --- /dev/null +++ b/website/www/site/content/en/documentation/sdks/yaml-join.md @@ -0,0 +1,182 @@ +--- +type: languages +title: "Apache Beam YAML Join" +--- + + +# Beam YAML Join + +Beam YAML can join two or more inputs on specified columns. For example, the +following pipeline joins the First Input pcollection and Second Input +pcollection when col1 in First Input is equal to col2 in Second Input. 
+ +``` +- type: Join + input: + input1: First Input + input2: Second Input + config: + equalities: + - input1: col1 + input2: col2 +``` + +When joining multiple inputs on one column that is named the same across all the +inputs, one can use the following shorthand syntax: + +``` +- type: Join + input: + input1: First Input + input2: Second Input + input3: Third Input + config: + equalities: col +``` + +## Join Types + +When using the Join transform, one can specify the type of join to perform on +the inputs. If no join type is specified, the inputs are all joined using an +inner join. The supported join types are: + +| Join Type | YAML Keyword | +| -------- | ------- | +| Inner Join | inner | +| Left Outer Join | left | +| Right Outer Join | right | + +The following example joins two inputs using an inner join on the specified +equalities: + +``` +- type: Join + input: + input1: First Input + input2: Second Input + config: + type: inner + equalities: + - input1: col1 + input2: col1 +``` + + +The following example joins two inputs using a left outer join on the specified +equalities. In this case, all rows from input1 will be kept because input1 is +the left input. Order of joins follows the sequence as specified in equalities. + +``` +- type: Join + input: + input1: First Input + input2: Second Input + config: + type: left + equalities: + - input1: col1 + input2: col1 +``` + +The following example joins three inputs using a full outer join on the +specified equalities: + +``` +- type: Join + input: + input1: First Input + input2: Second Input + input3: Third Input + config: + type: outer + equalities: + - input1: col1 + input2: col1 + - input2: col2 + input3: col2 +``` + +If you want a combination of join types, you can specify the inputs to be outer +joined. The following example joins input1 with input2 using a right outer join +since input2 is on the right side and will join input2 with input3 using a left +outer join since input2 is on the left side. 
+ +``` +- type: Join + input: + input1: First Input + input2: Second Input + input3: Third Input + config: + type: + outer: + - input2 + equalities: + - input1: col1 + input2: col1 + - input2: col2 + input3: col2 +``` + +## Fields +By default, the join transform includes all columns from all input tables. If +column names clash, it's best to rename them explicitly. Otherwise, the system +will deduplicate names by adding a numeric suffix. + +To choose which columns to output, or to customize the output column names, use +the "fields" configuration. + +To specify which columns to output from an input, use the input reference as the +configuration key and a list of desired columns as the configuration value. The +following example outputs col1 from input1, col2 and col3 from input2, and all +the columns from input3. If there is a name clash, it appends a numeric suffix +to avoid duplicate naming. + +``` +- type: Join + input: + input1: First Input + input2: Second Input + input3: Third Input + config: + equalities: col1 + fields: + input1: [col1] + input2: [col2, col3] +``` + +To rename a column in the output, create a mapping for the input with the key as +the new column name and the value as the original column name. The following +example maps col1 from input3 to the column name "renamed_col1": + +``` +- type: Join + input: + input1: First Input + input2: Second Input + input3: Third Input + config: + equalities: col1 + fields: + input1: [col1] + input2: [col2, col3] + input3: + renamed_col1: col1 +``` diff --git a/website/www/site/layouts/partials/section-menu/en/sdks.html b/website/www/site/layouts/partials/section-menu/en/sdks.html index fd7de314992b1..ea48eb6f40d9b 100644 --- a/website/www/site/layouts/partials/section-menu/en/sdks.html +++ b/website/www/site/layouts/partials/section-menu/en/sdks.html @@ -92,6 +92,7 @@ 
  • Yaml Aggregation
  • Error handling
  • Inlining Python
  • +
  • Yaml Join
  • YAML API reference External link. From ec152e283557a7b9ba273dac4b6fc6400786d2cf Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Fri, 9 Aug 2024 13:31:17 -0700 Subject: [PATCH 55/78] [#32139] Fail pipelines with Stateful SDFs. (#32140) * [#32139] Fail pipelines with Stateful SDFs. * rm debug print --------- Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- .../runners/prism/internal/jobservices/management.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go index 7676d958031c1..2b03eddff05d7 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go +++ b/sdks/go/pkg/beam/runners/prism/internal/jobservices/management.go @@ -158,17 +158,26 @@ func (s *Server) Prepare(ctx context.Context, req *jobpb.PrepareJobRequest) (*jo return nil, fmt.Errorf("unable to unmarshal ParDoPayload for %v - %q: %w", tid, t.GetUniqueName(), err) } + isStateful := false + // Validate all the state features for _, spec := range pardo.GetStateSpecs() { + isStateful = true check("StateSpec.Protocol.Urn", spec.GetProtocol().GetUrn(), urns.UserStateBag, urns.UserStateMultiMap) } // Validate all the timer features for _, spec := range pardo.GetTimerFamilySpecs() { + isStateful = true check("TimerFamilySpecs.TimeDomain.Urn", spec.GetTimeDomain(), pipepb.TimeDomain_EVENT_TIME, pipepb.TimeDomain_PROCESSING_TIME) } check("OnWindowExpirationTimerFamily", pardo.GetOnWindowExpirationTimerFamilySpec(), "") // Unsupported for now. 
+ // Check for a stateful SDF and direct user to https://github.com/apache/beam/issues/32139 + if pardo.GetRestrictionCoderId() != "" && isStateful { + check("Splittable+Stateful DoFn", "See https://github.com/apache/beam/issues/32139 for information.", "") + } + case urns.TransformTestStream: var testStream pipepb.TestStreamPayload if err := proto.Unmarshal(t.GetSpec().GetPayload(), &testStream); err != nil { From 17298b5572e9b0d8aa8c4d0ca1e51c3f832c0067 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Fri, 9 Aug 2024 14:21:32 -0700 Subject: [PATCH 56/78] [#32115] Fix timer support, support timer clears. (#32119) --- .../prism/internal/engine/elementmanager.go | 22 ++-- .../runners/prism/internal/engine/timers.go | 115 +++++++++++------- .../runners/portability/prism_runner_test.py | 32 +++++ 3 files changed, 119 insertions(+), 50 deletions(-) diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go index bc8449c72b39a..c73db507c7920 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go @@ -869,14 +869,20 @@ func (em *ElementManager) triageTimers(d TentativeData, inputInfo PColInfo, stag for tentativeKey, timers := range d.timers { keyToTimers := map[timerKey]element{} for _, t := range timers { - key, tag, elms := decodeTimer(inputInfo.KeyDec, true, t) - for _, e := range elms { - keyToTimers[timerKey{key: string(key), tag: tag, win: e.window}] = e - } - if len(elms) == 0 { - // TODO(lostluck): Determine best way to mark a timer cleared. 
- continue - } + // TODO: Call in a for:range loop when Beam's minimum Go version hits 1.23.0 + iter := decodeTimerIter(inputInfo.KeyDec, true, t) + iter(func(ret timerRet) bool { + for _, e := range ret.elms { + keyToTimers[timerKey{key: string(ret.keyBytes), tag: ret.tag, win: e.window}] = e + } + if len(ret.elms) == 0 { + for _, w := range ret.windows { + delete(keyToTimers, timerKey{key: string(ret.keyBytes), tag: ret.tag, win: w}) + } + } + // Indicate we'd like to continue iterating. + return true + }) } for _, elm := range keyToTimers { diff --git a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go index 787d27858a0e5..9a3bd6f9682bc 100644 --- a/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go +++ b/sdks/go/pkg/beam/runners/prism/internal/engine/timers.go @@ -31,53 +31,77 @@ import ( "google.golang.org/protobuf/encoding/protowire" ) -// DecodeTimer extracts timers to elements for insertion into their keyed queues. -// Returns the key bytes, tag, window exploded elements, and the hold timestamp. +type timerRet struct { + keyBytes []byte + tag string + elms []element + windows []typex.Window +} + +// decodeTimerIter extracts timers to elements for insertion into their keyed queues, +// through a go iterator function, to be called by the caller with their processing function. +// +// For each timer, a key, tag, windowed elements, and the window set are returned. +// // If the timer has been cleared, no elements will be returned. Any existing timers -// for the tag *must* be cleared from the pending queue. 
-func decodeTimer(keyDec func(io.Reader) []byte, usesGlobalWindow bool, raw []byte) ([]byte, string, []element) { - keyBytes := keyDec(bytes.NewBuffer(raw)) - - d := decoder{raw: raw, cursor: len(keyBytes)} - tag := string(d.Bytes()) - - var ws []typex.Window - numWin := d.Fixed32() - if usesGlobalWindow { - for i := 0; i < int(numWin); i++ { - ws = append(ws, window.GlobalWindow{}) - } - } else { - // Assume interval windows here, since we don't understand custom windows yet. - for i := 0; i < int(numWin); i++ { - ws = append(ws, d.IntervalWindow()) - } - } +// for the tag *must* be cleared from the pending queue. The windows associated with +// the clear are provided to be able to delete pending timers. +func decodeTimerIter(keyDec func(io.Reader) []byte, usesGlobalWindow bool, raw []byte) func(func(timerRet) bool) { + return func(yield func(timerRet) bool) { + for len(raw) > 0 { + keyBytes := keyDec(bytes.NewBuffer(raw)) + d := decoder{raw: raw, cursor: len(keyBytes)} + tag := string(d.Bytes()) + + var ws []typex.Window + numWin := d.Fixed32() + if usesGlobalWindow { + for i := 0; i < int(numWin); i++ { + ws = append(ws, window.GlobalWindow{}) + } + } else { + // Assume interval windows here, since we don't understand custom windows yet. + for i := 0; i < int(numWin); i++ { + ws = append(ws, d.IntervalWindow()) + } + } - clear := d.Bool() - hold := mtime.MaxTimestamp - if clear { - return keyBytes, tag, nil - } + clear := d.Bool() + hold := mtime.MaxTimestamp + if clear { + if !yield(timerRet{keyBytes, tag, nil, ws}) { + return // Halt iteration if yield returns false. + } + // Otherwise continue handling the remaining bytes. + raw = d.UnusedBytes() + continue + } - firing := d.Timestamp() - hold = d.Timestamp() - pane := d.Pane() + firing := d.Timestamp() + hold = d.Timestamp() + pane := d.Pane() + + var elms []element + for _, w := range ws { + elms = append(elms, element{ + tag: tag, + elmBytes: nil, // indicates this is a timer. 
+ keyBytes: keyBytes, + window: w, + timestamp: firing, + holdTimestamp: hold, + pane: pane, + sequence: len(elms), + }) + } - var ret []element - for _, w := range ws { - ret = append(ret, element{ - tag: tag, - elmBytes: nil, // indicates this is a timer. - keyBytes: keyBytes, - window: w, - timestamp: firing, - holdTimestamp: hold, - pane: pane, - sequence: len(ret), - }) + if !yield(timerRet{keyBytes, tag, elms, ws}) { + return // Halt iteration if yield returns false. + } + // Otherwise continue handling the remaining bytes. + raw = d.UnusedBytes() + } } - return keyBytes, tag, ret } type decoder struct { @@ -140,6 +164,13 @@ func (d *decoder) Bytes() []byte { return b } +// UnusedBytes returns the remainder of bytes in the buffer that weren't yet used. +// Multiple timers can be provided in a single timers buffer, since multiple dynamic +// timer tags may be set. +func (d *decoder) UnusedBytes() []byte { + return d.raw[d.cursor:] +} + func (d *decoder) Bool() bool { if b := d.Byte(); b == 0 { return false diff --git a/sdks/python/apache_beam/runners/portability/prism_runner_test.py b/sdks/python/apache_beam/runners/portability/prism_runner_test.py index 324fe5a17b545..b179156877e46 100644 --- a/sdks/python/apache_beam/runners/portability/prism_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/prism_runner_test.py @@ -40,6 +40,7 @@ from apache_beam.runners.portability import portable_runner_test from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to +from apache_beam.transforms import userstate from apache_beam.transforms import window from apache_beam.transforms.sql import SqlTransform from apache_beam.utils import timestamp @@ -200,6 +201,37 @@ def test_windowing(self): assert_that( res, equal_to([('k', [1, 2]), ('k', [100, 101, 102]), ('k', [123])])) + # The fn_runner_test.py version of this test doesn't execute the process + # method for some reason. 
Overridden here to validate that the cleared + # timer won't re-fire. + def test_pardo_timers_clear(self): + timer_spec = userstate.TimerSpec('timer', userstate.TimeDomain.WATERMARK) + + class TimerDoFn(beam.DoFn): + def process(self, element, timer=beam.DoFn.TimerParam(timer_spec)): + unused_key, ts = element + timer.set(ts) + timer.set(2 * ts) + + @userstate.on_timer(timer_spec) + def process_timer( + self, + ts=beam.DoFn.TimestampParam, + timer=beam.DoFn.TimerParam(timer_spec)): + timer.set(timestamp.Timestamp(micros=2 * ts.micros)) + timer.clear() # Shouldn't fire again + yield 'fired' + + with self.create_pipeline() as p: + actual = ( + p + | beam.Create([('k1', 10), ('k2', 100)]) + | beam.ParDo(TimerDoFn()) + | beam.Map(lambda x, ts=beam.DoFn.TimestampParam: (x, ts))) + + expected = [('fired', ts) for ts in (20, 200)] + assert_that(actual, equal_to(expected)) + # Can't read host files from within docker, read a "local" file there. def test_read(self): print('name:', __name__) From b21a84a4cd607f58a3794e274a913ca48da2e42c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:38:35 -0400 Subject: [PATCH 57/78] Managed Iceberg hive support and integration tests (#32052) * iceberg hive support and integration tests * split read and write tests; cleanup * add test documentation * extend new config_properties arg to translation tests * revert beam schema override * actually run hive ITs * trigger integration tests * cut down hive database source lines --- .../IO_Iceberg_Integration_Tests.json | 2 +- .../IO_Iceberg_Integration_Tests.yml | 2 +- sdks/java/io/iceberg/build.gradle | 5 + sdks/java/io/iceberg/hive/build.gradle | 80 +++++ sdks/java/io/iceberg/hive/exec/build.gradle | 58 ++++ .../io/iceberg/hive/IcebergHiveCatalogIT.java | 280 ++++++++++++++++++ .../testutils/HiveMetastoreExtension.java | 68 +++++ .../iceberg/hive/testutils/ScriptRunner.java | 203 +++++++++++++ 
.../hive/testutils/TestHiveMetastore.java | 273 +++++++++++++++++ .../resources/hive-schema-3.1.0.derby.sql | 267 +++++++++++++++++ .../sdk/io/iceberg/IcebergCatalogConfig.java | 37 ++- .../IcebergReadSchemaTransformProvider.java | 56 +--- .../IcebergWriteSchemaTransformProvider.java | 58 +--- .../iceberg/SchemaTransformConfiguration.java | 69 +++++ .../sdk/io/iceberg/IcebergIOReadTest.java | 16 +- .../sdk/io/iceberg/IcebergIOWriteTest.java | 40 ++- ...cebergReadSchemaTransformProviderTest.java | 4 +- ...IcebergSchemaTransformTranslationTest.java | 6 + ...ebergWriteSchemaTransformProviderTest.java | 5 +- .../beam/sdk/io/iceberg/ScanSourceTest.java | 33 ++- settings.gradle.kts | 4 + 21 files changed, 1423 insertions(+), 143 deletions(-) create mode 100644 sdks/java/io/iceberg/hive/build.gradle create mode 100644 sdks/java/io/iceberg/hive/exec/build.gradle create mode 100644 sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/IcebergHiveCatalogIT.java create mode 100644 sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/HiveMetastoreExtension.java create mode 100644 sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/ScriptRunner.java create mode 100644 sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/TestHiveMetastore.java create mode 100644 sdks/java/io/iceberg/hive/src/test/resources/hive-schema-3.1.0.derby.sql create mode 100644 sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaTransformConfiguration.java diff --git a/.github/trigger_files/IO_Iceberg_Integration_Tests.json b/.github/trigger_files/IO_Iceberg_Integration_Tests.json index bbdc3a3910ef8..62ae7886c5731 100644 --- a/.github/trigger_files/IO_Iceberg_Integration_Tests.json +++ b/.github/trigger_files/IO_Iceberg_Integration_Tests.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 3 + 
"modification": 4 } diff --git a/.github/workflows/IO_Iceberg_Integration_Tests.yml b/.github/workflows/IO_Iceberg_Integration_Tests.yml index 20d1f4bb60fd3..22b2b4f9287d5 100644 --- a/.github/workflows/IO_Iceberg_Integration_Tests.yml +++ b/.github/workflows/IO_Iceberg_Integration_Tests.yml @@ -75,4 +75,4 @@ jobs: - name: Run IcebergIO Integration Test uses: ./.github/actions/gradle-command-self-hosted-action with: - gradle-command: :sdks:java:io:iceberg:integrationTest \ No newline at end of file + gradle-command: :sdks:java:io:iceberg:catalogTests \ No newline at end of file diff --git a/sdks/java/io/iceberg/build.gradle b/sdks/java/io/iceberg/build.gradle index 7965cde86e7d9..3d653d6b276e9 100644 --- a/sdks/java/io/iceberg/build.gradle +++ b/sdks/java/io/iceberg/build.gradle @@ -115,6 +115,11 @@ task integrationTest(type: Test) { testClassesDirs = sourceSets.test.output.classesDirs } +tasks.register('catalogTests') { + dependsOn integrationTest + dependsOn ":sdks:java:io:iceberg:hive:integrationTest" +} + task loadTest(type: Test) { def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing' def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://temp-storage-for-end-to-end-tests/temp-lt' diff --git a/sdks/java/io/iceberg/hive/build.gradle b/sdks/java/io/iceberg/hive/build.gradle new file mode 100644 index 0000000000000..b81867ec90ca3 --- /dev/null +++ b/sdks/java/io/iceberg/hive/build.gradle @@ -0,0 +1,80 @@ +import groovy.json.JsonOutput + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +plugins { id 'org.apache.beam.module' } +applyJavaNature( + automaticModuleName: 'org.apache.beam.sdk.io.iceberg.hive', + exportJavadoc: false, + shadowClosure: {}, +) + +description = "Apache Beam :: SDKs :: Java :: IO :: Iceberg :: Hive" +ext.summary = "Runtime dependencies needed for Hive catalog integration." + +def hive_version = "3.1.3" +def iceberg_version = "1.4.2" + +dependencies { + // dependencies needed to run with iceberg's hive catalog + runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") + runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") + runtimeOnly library.java.bigdataoss_gcs_connector + runtimeOnly library.java.hadoop_client + + // ----- below dependencies are for testing and will not appear in the shaded jar ----- + // Beam IcebergIO dependencies + testImplementation project(path: ":sdks:java:core", configuration: "shadow") + testImplementation project(":sdks:java:managed") + testImplementation project(":sdks:java:io:iceberg") + testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") + testRuntimeOnly library.java.snake_yaml + + // needed to set up the test environment + testImplementation "org.apache.iceberg:iceberg-common:$iceberg_version" + testImplementation "org.apache.iceberg:iceberg-core:$iceberg_version" + testImplementation "org.assertj:assertj-core:3.11.1" + testImplementation library.java.junit + + // needed to set up test Hive Metastore and run tests + testImplementation ("org.apache.iceberg:iceberg-hive-metastore:$iceberg_version") + 
testImplementation project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") + testRuntimeOnly ("org.apache.hive.hcatalog:hive-hcatalog-core:$hive_version") { + exclude group: "org.apache.hive", module: "hive-exec" + exclude group: "org.apache.parquet", module: "parquet-hadoop-bundle" + } + testImplementation "org.apache.iceberg:iceberg-parquet:$iceberg_version" + testImplementation "org.apache.parquet:parquet-column:1.12.0" +} + +task integrationTest(type: Test) { + group = "Verification" + def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://temp-storage-for-end-to-end-tests/iceberg-hive-it' + systemProperty "beamTestPipelineOptions", JsonOutput.toJson([ + "--tempLocation=${gcpTempLocation}", + ]) + + // Disable Gradle cache: these ITs interact with live service that should always be considered "out of date" + outputs.upToDateWhen { false } + + include '**/*IT.class' + + maxParallelForks 4 + classpath = sourceSets.test.runtimeClasspath + testClassesDirs = sourceSets.test.output.classesDirs +} \ No newline at end of file diff --git a/sdks/java/io/iceberg/hive/exec/build.gradle b/sdks/java/io/iceberg/hive/exec/build.gradle new file mode 100644 index 0000000000000..581f71ddedd1f --- /dev/null +++ b/sdks/java/io/iceberg/hive/exec/build.gradle @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +plugins { + id 'org.apache.beam.module' + id 'java' + id 'com.github.johnrengelman.shadow' +} + +dependencies { + implementation("org.apache.hive:hive-exec:3.1.3") + permitUnusedDeclared("org.apache.hive:hive-exec:3.1.3") +} + +configurations { + shadow +} + +artifacts { + shadow(archives(shadowJar) { + builtBy shadowJar + }) +} + +shadowJar { + zip64 true + + // need to shade "com.google.guava" to avoid Guava conflict + relocate 'com.google.protobuf', getJavaRelocatedPath('com.google.protobuf') + relocate 'shaded.parquet', getJavaRelocatedPath('shaded.parquet') + relocate 'org.apache.parquet', getJavaRelocatedPath('org.apache.parquet') + + version "3.1.3" + mergeServiceFiles() + + exclude 'LICENSE' + exclude( + 'org/xml/**', + 'javax/**', + 'com/sun/**' + ) +} +description = "Apache Beam :: SDKs :: Java :: IO :: Iceberg :: Hive :: Exec" +ext.summary = "A copy of the hive-exec dependency with some popular libraries relocated." diff --git a/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/IcebergHiveCatalogIT.java b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/IcebergHiveCatalogIT.java new file mode 100644 index 0000000000000..54a4998d37fba --- /dev/null +++ b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/IcebergHiveCatalogIT.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.hive; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsInAnyOrder; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import org.apache.beam.sdk.io.iceberg.IcebergUtils; +import org.apache.beam.sdk.io.iceberg.hive.testutils.HiveMetastoreExtension; +import org.apache.beam.sdk.managed.Managed; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.CatalogProperties; +import 
org.apache.iceberg.CatalogUtil; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestWriter; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetReaders; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.encryption.InputFilesDecryptor; +import org.apache.iceberg.hive.HiveCatalog; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.parquet.Parquet; +import org.apache.thrift.TException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; + +/** + * Read and write test for {@link Managed} {@link org.apache.beam.sdk.io.iceberg.IcebergIO} using + * {@link HiveCatalog}. + * + *

    Spins up a local Hive metastore to manage the Iceberg table. Warehouse path is set to a GCS + * bucket. + */ +public class IcebergHiveCatalogIT { + private static final Schema DOUBLY_NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("doubly_nested_str") + .addInt64Field("doubly_nested_float") + .build(); + + private static final Schema NESTED_ROW_SCHEMA = + Schema.builder() + .addStringField("nested_str") + .addInt32Field("nested_int") + .addFloatField("nested_float") + .addRowField("nested_row", DOUBLY_NESTED_ROW_SCHEMA) + .build(); + private static final Schema BEAM_SCHEMA = + Schema.builder() + .addStringField("str") + .addBooleanField("bool") + .addNullableInt32Field("nullable_int") + .addNullableInt64Field("nullable_long") + .addArrayField("arr_long", Schema.FieldType.INT64) + .addRowField("row", NESTED_ROW_SCHEMA) + .addNullableRowField("nullable_row", NESTED_ROW_SCHEMA) + .build(); + + private static final SimpleFunction ROW_FUNC = + new SimpleFunction() { + @Override + public Row apply(Long num) { + String strNum = Long.toString(num); + Row nestedRow = + Row.withSchema(NESTED_ROW_SCHEMA) + .addValue("nested_str_value_" + strNum) + .addValue(Integer.valueOf(strNum)) + .addValue(Float.valueOf(strNum + "." + strNum)) + .addValue( + Row.withSchema(DOUBLY_NESTED_ROW_SCHEMA) + .addValue("doubly_nested_str_value_" + strNum) + .addValue(num) + .build()) + .build(); + + return Row.withSchema(BEAM_SCHEMA) + .addValue("str_value_" + strNum) + .addValue(num % 2 == 0) + .addValue(Integer.valueOf(strNum)) + .addValue(num) + .addValue(LongStream.range(1, num % 10).boxed().collect(Collectors.toList())) + .addValue(nestedRow) + .addValue(num % 2 == 0 ? 
null : nestedRow) + .build(); + } + }; + + private static final org.apache.iceberg.Schema ICEBERG_SCHEMA = + IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA); + private static final SimpleFunction RECORD_FUNC = + new SimpleFunction() { + @Override + public Record apply(Row input) { + return IcebergUtils.beamRowToIcebergRecord(ICEBERG_SCHEMA, input); + } + }; + + private static HiveMetastoreExtension hiveMetastoreExtension; + + @Rule public TestPipeline writePipeline = TestPipeline.create(); + + @Rule public TestPipeline readPipeline = TestPipeline.create(); + + private static final String TEST_CATALOG = "test_catalog"; + private static final String TEST_TABLE = "test_table"; + private static HiveCatalog catalog; + private static final String TEST_DB = "test_db_" + System.nanoTime(); + + @BeforeClass + public static void setUp() throws TException { + String warehousePath = TestPipeline.testingPipelineOptions().getTempLocation(); + hiveMetastoreExtension = new HiveMetastoreExtension(warehousePath); + catalog = + (HiveCatalog) + CatalogUtil.loadCatalog( + HiveCatalog.class.getName(), + TEST_CATALOG, + ImmutableMap.of( + CatalogProperties.CLIENT_POOL_CACHE_EVICTION_INTERVAL_MS, + String.valueOf(TimeUnit.SECONDS.toMillis(10))), + hiveMetastoreExtension.hiveConf()); + + String dbPath = hiveMetastoreExtension.metastore().getDatabasePath(TEST_DB); + Database db = new Database(TEST_DB, "description", dbPath, Maps.newHashMap()); + hiveMetastoreExtension.metastoreClient().createDatabase(db); + } + + @AfterClass + public static void cleanup() throws Exception { + hiveMetastoreExtension.cleanup(); + } + + private Map getManagedIcebergConfig(TableIdentifier table) { + String metastoreUri = hiveMetastoreExtension.hiveConf().getVar(HiveConf.ConfVars.METASTOREURIS); + + Map confProperties = + ImmutableMap.builder() + .put(HiveConf.ConfVars.METASTOREURIS.varname, metastoreUri) + .build(); + + return ImmutableMap.builder() + .put("table", table.toString()) + 
.put("config_properties", confProperties) + .build(); + } + + @Test + public void testReadWithHiveCatalog() throws IOException { + TableIdentifier tableIdentifier = + TableIdentifier.parse(String.format("%s.%s", TEST_DB, TEST_TABLE + "_read_test")); + Table table = catalog.createTable(tableIdentifier, ICEBERG_SCHEMA); + + List expectedRows = + LongStream.range(1, 1000).boxed().map(ROW_FUNC::apply).collect(Collectors.toList()); + List records = + expectedRows.stream().map(RECORD_FUNC::apply).collect(Collectors.toList()); + + // write iceberg records with hive catalog + String filepath = table.location() + "/" + UUID.randomUUID(); + DataWriter writer = + Parquet.writeData(table.io().newOutputFile(filepath)) + .schema(ICEBERG_SCHEMA) + .createWriterFunc(GenericParquetWriter::buildWriter) + .overwrite() + .withSpec(table.spec()) + .build(); + for (Record rec : records) { + writer.write(rec); + } + writer.close(); + AppendFiles appendFiles = table.newAppend(); + String manifestFilename = FileFormat.AVRO.addExtension(filepath + ".manifest"); + OutputFile outputFile = table.io().newOutputFile(manifestFilename); + ManifestWriter manifestWriter; + try (ManifestWriter openWriter = ManifestFiles.write(table.spec(), outputFile)) { + openWriter.add(writer.toDataFile()); + manifestWriter = openWriter; + } + appendFiles.appendManifest(manifestWriter.toManifestFile()); + appendFiles.commit(); + + // Run Managed Iceberg read + PCollection outputRows = + readPipeline + .apply( + Managed.read(Managed.ICEBERG).withConfig(getManagedIcebergConfig(tableIdentifier))) + .getSinglePCollection(); + PAssert.that(outputRows).containsInAnyOrder(expectedRows); + readPipeline.run().waitUntilFinish(); + } + + @Test + public void testWriteWithHiveCatalog() { + TableIdentifier tableIdentifier = + TableIdentifier.parse(String.format("%s.%s", TEST_DB, TEST_TABLE + "_write_test")); + catalog.createTable(tableIdentifier, IcebergUtils.beamSchemaToIcebergSchema(BEAM_SCHEMA)); + + List inputRows = + 
LongStream.range(1, 1000).mapToObj(ROW_FUNC::apply).collect(Collectors.toList()); + List expectedRecords = + inputRows.stream().map(RECORD_FUNC::apply).collect(Collectors.toList()); + + // Run Managed Iceberg write + writePipeline + .apply(Create.of(inputRows)) + .setRowSchema(BEAM_SCHEMA) + .apply(Managed.write(Managed.ICEBERG).withConfig(getManagedIcebergConfig(tableIdentifier))); + writePipeline.run().waitUntilFinish(); + + // read back the records and check everything's there + Table table = catalog.loadTable(tableIdentifier); + TableScan tableScan = table.newScan().project(ICEBERG_SCHEMA); + List writtenRecords = new ArrayList<>(); + for (CombinedScanTask task : tableScan.planTasks()) { + InputFilesDecryptor decryptor = new InputFilesDecryptor(task, table.io(), table.encryption()); + for (FileScanTask fileTask : task.files()) { + InputFile inputFile = decryptor.getInputFile(fileTask); + CloseableIterable iterable = + Parquet.read(inputFile) + .split(fileTask.start(), fileTask.length()) + .project(ICEBERG_SCHEMA) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(ICEBERG_SCHEMA, fileSchema)) + .filter(fileTask.residual()) + .build(); + + for (Record rec : iterable) { + writtenRecords.add(rec); + } + } + } + assertThat(expectedRecords, containsInAnyOrder(writtenRecords.toArray())); + } +} diff --git a/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/HiveMetastoreExtension.java b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/HiveMetastoreExtension.java new file mode 100644 index 0000000000000..52de1b91a216a --- /dev/null +++ b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/HiveMetastoreExtension.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.hive.testutils; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.MetaException; + +/** + * A class that interacts with {@link TestHiveMetastore}. + * + *

    Trimmed down from Iceberg's + * integration testing util + */ +public class HiveMetastoreExtension { + private HiveMetaStoreClient metastoreClient; + private TestHiveMetastore metastore; + + public HiveMetastoreExtension(String warehousePath) throws MetaException { + metastore = new TestHiveMetastore(warehousePath); + HiveConf hiveConf = new HiveConf(TestHiveMetastore.class); + + metastore.start(hiveConf); + metastoreClient = new HiveMetaStoreClient(hiveConf); + } + + public void cleanup() throws Exception { + if (metastoreClient != null) { + metastoreClient.close(); + } + + if (metastore != null) { + metastore.reset(); + metastore.stop(); + } + + metastoreClient = null; + metastore = null; + } + + public HiveMetaStoreClient metastoreClient() { + return metastoreClient; + } + + public HiveConf hiveConf() { + return metastore.hiveConf(); + } + + public TestHiveMetastore metastore() { + return metastore; + } +} diff --git a/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/ScriptRunner.java b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/ScriptRunner.java new file mode 100644 index 0000000000000..adf941e00b4b6 --- /dev/null +++ b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/ScriptRunner.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.hive.testutils; + +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.PrintWriter; +import java.io.Reader; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Statement; + +/** + * Tool to run database scripts. + * + *

    Copied over from Iceberg's + * integration testing + */ +@SuppressWarnings({"OperatorPrecedence", "DefaultCharset"}) +public class ScriptRunner { + + private static final String DEFAULT_DELIMITER = ";"; + + private final Connection connection; + + private final boolean stopOnError; + private final boolean autoCommit; + + private final PrintWriter logWriter = new PrintWriter(System.out); + private final PrintWriter errorLogWriter = new PrintWriter(System.err); + + /** Default constructor. */ + public ScriptRunner(Connection connection, boolean autoCommit, boolean stopOnError) { + this.connection = connection; + this.autoCommit = autoCommit; + this.stopOnError = stopOnError; + } + + /** + * Runs an SQL script (read in using the Reader parameter). + * + * @param reader - the source of the script + */ + public void runScript(Reader reader) throws IOException, SQLException { + try { + boolean originalAutoCommit = connection.getAutoCommit(); + try { + if (originalAutoCommit != this.autoCommit) { + connection.setAutoCommit(this.autoCommit); + } + runScript(connection, reader); + } finally { + connection.setAutoCommit(originalAutoCommit); + } + } catch (IOException | SQLException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException("Error running script. Cause: " + e, e); + } + } + + /** + * Runs an SQL script (read in using the Reader parameter) using the connection passed in. 
+ * + * @param conn - the connection to use for the script + * @param reader - the source of the script + * @throws SQLException if any SQL errors occur + * @throws IOException if there is an error reading from the Reader + */ + @SuppressWarnings("checkstyle:CyclomaticComplexity") + private void runScript(Connection conn, Reader reader) throws IOException, SQLException { + StringBuilder command = null; + try { + LineNumberReader lineReader = new LineNumberReader(reader); + String line; + while ((line = lineReader.readLine()) != null) { + if (command == null) { + command = new StringBuilder(); + } + String trimmedLine = line.trim(); + boolean fullLineDelimiter = false; + if (trimmedLine.startsWith("--")) { + println(trimmedLine); + } else if (trimmedLine.isEmpty() || trimmedLine.startsWith("//")) { + // Do nothing + } else if (!fullLineDelimiter && trimmedLine.endsWith(getDelimiter()) + || fullLineDelimiter && trimmedLine.equals(getDelimiter())) { + command.append(line, 0, line.lastIndexOf(getDelimiter())); + command.append(" "); + Statement statement = conn.createStatement(); + + println(command); + + boolean hasResults = false; + if (stopOnError) { + hasResults = statement.execute(command.toString()); + } else { + try { + statement.execute(command.toString()); + } catch (SQLException e) { + e.fillInStackTrace(); + printlnError("Error executing: " + command); + printlnError(e); + } + } + + if (autoCommit && !conn.getAutoCommit()) { + conn.commit(); + } + + ResultSet rs = statement.getResultSet(); + if (hasResults && rs != null) { + ResultSetMetaData md = rs.getMetaData(); + int cols = md.getColumnCount(); + for (int i = 0; i < cols; i++) { + String name = md.getColumnLabel(i); + print(name + "\t"); + } + println(""); + while (rs.next()) { + for (int i = 0; i < cols; i++) { + String value = rs.getString(i); + print(value + "\t"); + } + println(""); + } + } + + command = null; + try { + statement.close(); + } catch (Exception e) { + // Ignore to workaround a bug in 
Jakarta DBCP + } + Thread.yield(); + } else { + command.append(line); + command.append(" "); + } + } + if (!autoCommit) { + conn.commit(); + } + } catch (IOException | SQLException e) { + e.fillInStackTrace(); + printlnError("Error executing: " + command); + printlnError(e); + throw e; + } finally { + conn.rollback(); + flush(); + } + } + + private String getDelimiter() { + return DEFAULT_DELIMITER; + } + + private void print(Object obj) { + if (logWriter != null) { + System.out.print(obj); + } + } + + private void println(Object obj) { + if (logWriter != null) { + logWriter.println(obj); + } + } + + private void printlnError(Object obj) { + if (errorLogWriter != null) { + errorLogWriter.println(obj); + } + } + + private void flush() { + if (logWriter != null) { + logWriter.flush(); + } + if (errorLogWriter != null) { + errorLogWriter.flush(); + } + } +} diff --git a/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/TestHiveMetastore.java b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/TestHiveMetastore.java new file mode 100644 index 0000000000000..e3af43d58c65f --- /dev/null +++ b/sdks/java/io/iceberg/hive/src/test/java/org/apache/beam/sdk/io/iceberg/hive/testutils/TestHiveMetastore.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg.hive.testutils; + +import static java.nio.file.Files.createTempDirectory; +import static java.nio.file.attribute.PosixFilePermissions.asFileAttribute; +import static java.nio.file.attribute.PosixFilePermissions.fromString; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStore; +import org.apache.hadoop.hive.metastore.IHMSHandler; +import org.apache.hadoop.hive.metastore.RetryingHMSHandler; +import org.apache.hadoop.hive.metastore.TSetIpAddressProcessor; +import org.apache.iceberg.common.DynConstructors; +import org.apache.iceberg.common.DynMethods; +import org.apache.iceberg.hadoop.Util; +import org.apache.iceberg.hive.HiveClientPool; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.server.TServer; +import org.apache.thrift.server.TThreadPoolServer; +import org.apache.thrift.transport.TServerSocket; +import org.apache.thrift.transport.TTransportFactory; + +/** 
+ * A Hive Metastore implementation for local testing. Not meant to be used directly. Use {@link + * HiveMetastoreExtension} instead. + * + *

    Copied over from Iceberg's + * integration testing util + */ +public class TestHiveMetastore { + + private static final String DEFAULT_DATABASE_NAME = "default"; + private static final int DEFAULT_POOL_SIZE = 5; + + // create the metastore handlers based on whether we're working with Hive2 or Hive3 dependencies + // we need to do this because there is a breaking API change between Hive2 and Hive3 + private static final DynConstructors.Ctor HMS_HANDLER_CTOR = + DynConstructors.builder() + .impl(HiveMetaStore.HMSHandler.class, String.class, Configuration.class) + .impl(HiveMetaStore.HMSHandler.class, String.class, HiveConf.class) + .build(); + + private static final DynMethods.StaticMethod GET_BASE_HMS_HANDLER = + DynMethods.builder("getProxy") + .impl(RetryingHMSHandler.class, Configuration.class, IHMSHandler.class, boolean.class) + .impl(RetryingHMSHandler.class, HiveConf.class, IHMSHandler.class, boolean.class) + .buildStatic(); + + // Hive3 introduces background metastore tasks (MetastoreTaskThread) for performing various + // cleanup duties. These + // threads are scheduled and executed in a static thread pool + // (org.apache.hadoop.hive.metastore.ThreadPool). + // This thread pool is shut down normally as part of the JVM shutdown hook, but since we're + // creating and tearing down + // multiple metastore instances within the same JVM, we have to call this cleanup method manually, + // otherwise + // threads from our previous test suite will be stuck in the pool with stale config, and keep on + // being scheduled. + // This can lead to issues, e.g. accidental Persistence Manager closure by + // ScheduledQueryExecutionsMaintTask. + private static final DynMethods.StaticMethod METASTORE_THREADS_SHUTDOWN = + DynMethods.builder("shutdown") + .impl("org.apache.hadoop.hive.metastore.ThreadPool") + .orNoop() + .buildStatic(); + + // It's tricky to clear all static fields in an HMS instance in order to switch derby root dir. 
+ // Therefore, we reuse the same derby root between tests and remove it after JVM exits. + private static final File HIVE_LOCAL_DIR; + private static final String DERBY_PATH; + + static { + try { + HIVE_LOCAL_DIR = + createTempDirectory("hive", asFileAttribute(fromString("rwxrwxrwx"))).toFile(); + DERBY_PATH = HIVE_LOCAL_DIR + "/metastore_db"; + File derbyLogFile = new File(HIVE_LOCAL_DIR, "derby.log"); + System.setProperty("derby.stream.error.file", derbyLogFile.getAbsolutePath()); + setupMetastoreDB("jdbc:derby:" + DERBY_PATH + ";create=true"); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + Path localDirPath = new Path(HIVE_LOCAL_DIR.getAbsolutePath()); + FileSystem fs = Util.getFs(localDirPath, new Configuration()); + String errMsg = "Failed to delete " + localDirPath; + try { + assertThat(fs.delete(localDirPath, true)).as(errMsg).isTrue(); + } catch (IOException e) { + throw new RuntimeException(errMsg, e); + } + })); + } catch (Exception e) { + throw new RuntimeException("Failed to setup local dir for hive metastore", e); + } + } + + private HiveConf hiveConf; + private ExecutorService executorService; + private TServer server; + private HiveMetaStore.HMSHandler baseHandler; + private HiveClientPool clientPool; + private final String hiveWarehousePath; + + TestHiveMetastore(String hiveWarehousePath) { + this.hiveWarehousePath = hiveWarehousePath; + } + + /** + * Starts a TestHiveMetastore with the default connection pool size (5) with the provided + * HiveConf. + * + * @param conf The hive configuration to use + */ + public void start(HiveConf conf) { + start(conf, DEFAULT_POOL_SIZE); + } + + /** + * Starts a TestHiveMetastore with a provided connection pool size and HiveConf. 
+ * + * @param conf The hive configuration to use + * @param poolSize The number of threads in the executor pool + */ + @SuppressWarnings("FutureReturnValueIgnored") + public void start(HiveConf conf, int poolSize) { + try { + TServerSocket socket = new TServerSocket(0); + int port = socket.getServerSocket().getLocalPort(); + initConf(conf, port); + + this.hiveConf = conf; + this.server = newThriftServer(socket, poolSize, hiveConf); + this.executorService = Executors.newSingleThreadExecutor(); + this.executorService.submit(() -> server.serve()); + this.clientPool = new HiveClientPool(1, hiveConf); + } catch (Exception e) { + throw new RuntimeException("Cannot start TestHiveMetastore", e); + } + } + + public void stop() throws Exception { + reset(); + if (clientPool != null) { + clientPool.close(); + } + if (server != null) { + server.stop(); + } + if (executorService != null) { + executorService.shutdown(); + } + if (baseHandler != null) { + baseHandler.shutdown(); + } + METASTORE_THREADS_SHUTDOWN.invoke(); + } + + public HiveConf hiveConf() { + return hiveConf; + } + + public String getDatabasePath(String dbName) { + return hiveWarehousePath + "/" + dbName + ".db"; + } + + public void reset() throws Exception { + if (clientPool != null) { + for (String dbName : clientPool.run(client -> client.getAllDatabases())) { + for (String tblName : clientPool.run(client -> client.getAllTables(dbName))) { + clientPool.run( + client -> { + client.dropTable(dbName, tblName, true, true, true); + return null; + }); + } + + if (!DEFAULT_DATABASE_NAME.equals(dbName)) { + // Drop cascade, functions dropped by cascade + clientPool.run( + client -> { + client.dropDatabase(dbName, true, true, true); + return null; + }); + } + } + } + + Path warehouseRoot = new Path(hiveWarehousePath); + FileSystem fs = Util.getFs(warehouseRoot, hiveConf); + for (FileStatus fileStatus : fs.listStatus(warehouseRoot)) { + if (!fileStatus.getPath().getName().equals("derby.log") + && 
!fileStatus.getPath().getName().equals("metastore_db")) { + fs.delete(fileStatus.getPath(), true); + } + } + } + + private TServer newThriftServer(TServerSocket socket, int poolSize, HiveConf conf) + throws Exception { + HiveConf serverConf = new HiveConf(conf); + serverConf.set( + HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, + "jdbc:derby:" + DERBY_PATH + ";create=true"); + baseHandler = HMS_HANDLER_CTOR.newInstance("new db based metaserver", serverConf); + IHMSHandler handler = GET_BASE_HMS_HANDLER.invoke(serverConf, baseHandler, false); + + TThreadPoolServer.Args args = + new TThreadPoolServer.Args(socket) + .processor(new TSetIpAddressProcessor<>(handler)) + .transportFactory(new TTransportFactory()) + .protocolFactory(new TBinaryProtocol.Factory()) + .minWorkerThreads(poolSize) + .maxWorkerThreads(poolSize); + + return new TThreadPoolServer(args); + } + + private void initConf(HiveConf conf, int port) { + conf.set(HiveConf.ConfVars.METASTOREURIS.varname, "thrift://localhost:" + port); + conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, hiveWarehousePath); + conf.set(HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname, "false"); + conf.set(HiveConf.ConfVars.METASTORE_DISALLOW_INCOMPATIBLE_COL_TYPE_CHANGES.varname, "false"); + conf.set("iceberg.hive.client-pool-size", "2"); + // Setting this to avoid thrift exception during running Iceberg tests outside Iceberg. 
+ conf.set( + HiveConf.ConfVars.HIVE_IN_TEST.varname, HiveConf.ConfVars.HIVE_IN_TEST.getDefaultValue()); + } + + private static void setupMetastoreDB(String dbURL) throws SQLException, IOException { + Connection connection = DriverManager.getConnection(dbURL); + ScriptRunner scriptRunner = new ScriptRunner(connection, true, true); + + ClassLoader classLoader = ClassLoader.getSystemClassLoader(); + InputStream inputStream = classLoader.getResourceAsStream("hive-schema-3.1.0.derby.sql"); + try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) { + scriptRunner.runScript(reader); + } + } +} diff --git a/sdks/java/io/iceberg/hive/src/test/resources/hive-schema-3.1.0.derby.sql b/sdks/java/io/iceberg/hive/src/test/resources/hive-schema-3.1.0.derby.sql new file mode 100644 index 0000000000000..808c605857648 --- /dev/null +++ b/sdks/java/io/iceberg/hive/src/test/resources/hive-schema-3.1.0.derby.sql @@ -0,0 +1,267 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- +-- This file was copied from Apache Hive, at: +-- https://github.com/apache/hive/blob/master/standalone-metastore/metastore-server/src/main/sql/derby/hive-schema-3.1.0.derby.sql +-- +-- This has been modified slightly for compatibility with older Hive versions. +-- +-- Timestamp: 2011-09-22 15:32:02.024 +-- Source database is: /home/carl/Work/repos/hive1/metastore/scripts/upgrade/derby/mdb +-- Connection URL is: jdbc:derby:/home/carl/Work/repos/hive1/metastore/scripts/upgrade/derby/mdb +-- Specified schema is: APP +-- appendLogs: false + +-- ---------------------------------------------- +-- DDL Statements for functions +-- ---------------------------------------------- + +CREATE FUNCTION "APP"."NUCLEUS_ASCII" (C CHAR(1)) RETURNS INTEGER LANGUAGE JAVA PARAMETER STYLE JAVA READS SQL DATA CALLED ON NULL INPUT EXTERNAL NAME 'org.datanucleus.store.rdbms.adapter.DerbySQLFunction.ascii' ; + +CREATE FUNCTION "APP"."NUCLEUS_MATCHES" (TEXT VARCHAR(8000),PATTERN VARCHAR(8000)) RETURNS INTEGER LANGUAGE JAVA PARAMETER STYLE JAVA READS SQL DATA CALLED ON NULL INPUT EXTERNAL NAME 'org.datanucleus.store.rdbms.adapter.DerbySQLFunction.matches' ; + +-- ---------------------------------------------- +-- DDL Statements for tables +-- ---------------------------------------------- +CREATE TABLE "APP"."DBS" ( + "DB_ID" BIGINT NOT NULL, + "DESC" VARCHAR(4000), + "DB_LOCATION_URI" VARCHAR(4000) NOT NULL, + "NAME" VARCHAR(128), + "OWNER_NAME" VARCHAR(128), + "OWNER_TYPE" VARCHAR(10), + "CTLG_NAME" VARCHAR(256) +); + +CREATE TABLE "APP"."DATABASE_PARAMS" ("DB_ID" BIGINT NOT NULL, "PARAM_KEY" VARCHAR(180) NOT NULL, "PARAM_VALUE" VARCHAR(4000)); + +CREATE TABLE "APP"."SERDE_PARAMS" ("SERDE_ID" BIGINT NOT NULL, "PARAM_KEY" VARCHAR(256) NOT NULL, "PARAM_VALUE" CLOB); + +CREATE TABLE "APP"."COLUMNS_V2" ("CD_ID" BIGINT NOT NULL, "COMMENT" VARCHAR(4000), "COLUMN_NAME" VARCHAR(767) NOT NULL, "TYPE_NAME" CLOB, "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."SORT_COLS" ("SD_ID" BIGINT 
NOT NULL, "COLUMN_NAME" VARCHAR(767), "ORDER" INTEGER NOT NULL, "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."CDS" ("CD_ID" BIGINT NOT NULL); + +CREATE TABLE "APP"."SERDES" ("SERDE_ID" BIGINT NOT NULL, "NAME" VARCHAR(128), "SLIB" VARCHAR(4000), "DESCRIPTION" VARCHAR(4000), "SERIALIZER_CLASS" VARCHAR(4000), "DESERIALIZER_CLASS" VARCHAR(4000), SERDE_TYPE INTEGER); + +CREATE TABLE "APP"."ROLE_MAP" ("ROLE_GRANT_ID" BIGINT NOT NULL, "ADD_TIME" INTEGER NOT NULL, "GRANT_OPTION" SMALLINT NOT NULL, "GRANTOR" VARCHAR(128), "GRANTOR_TYPE" VARCHAR(128), "PRINCIPAL_NAME" VARCHAR(128), "PRINCIPAL_TYPE" VARCHAR(128), "ROLE_ID" BIGINT); + +CREATE TABLE "APP"."GLOBAL_PRIVS" ("USER_GRANT_ID" BIGINT NOT NULL, "CREATE_TIME" INTEGER NOT NULL, "GRANT_OPTION" SMALLINT NOT NULL, "GRANTOR" VARCHAR(128), "GRANTOR_TYPE" VARCHAR(128), "PRINCIPAL_NAME" VARCHAR(128), "PRINCIPAL_TYPE" VARCHAR(128), "USER_PRIV" VARCHAR(128), "AUTHORIZER" VARCHAR(128)); + +CREATE TABLE "APP"."ROLES" ("ROLE_ID" BIGINT NOT NULL, "CREATE_TIME" INTEGER NOT NULL, "OWNER_NAME" VARCHAR(128), "ROLE_NAME" VARCHAR(128)); + +CREATE TABLE "APP"."TBLS" ("TBL_ID" BIGINT NOT NULL, "CREATE_TIME" INTEGER NOT NULL, "DB_ID" BIGINT, "LAST_ACCESS_TIME" INTEGER NOT NULL, "OWNER" VARCHAR(767), "OWNER_TYPE" VARCHAR(10), "RETENTION" INTEGER NOT NULL, "SD_ID" BIGINT, "TBL_NAME" VARCHAR(256), "TBL_TYPE" VARCHAR(128), "VIEW_EXPANDED_TEXT" LONG VARCHAR, "VIEW_ORIGINAL_TEXT" LONG VARCHAR, "IS_REWRITE_ENABLED" CHAR(1) NOT NULL DEFAULT 'N'); + +CREATE TABLE "APP"."PARTITION_KEYS" ("TBL_ID" BIGINT NOT NULL, "PKEY_COMMENT" VARCHAR(4000), "PKEY_NAME" VARCHAR(128) NOT NULL, "PKEY_TYPE" VARCHAR(767) NOT NULL, "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."SDS" ("SD_ID" BIGINT NOT NULL, "INPUT_FORMAT" VARCHAR(4000), "IS_COMPRESSED" CHAR(1) NOT NULL, "LOCATION" VARCHAR(4000), "NUM_BUCKETS" INTEGER NOT NULL, "OUTPUT_FORMAT" VARCHAR(4000), "SERDE_ID" BIGINT, "CD_ID" BIGINT, "IS_STOREDASSUBDIRECTORIES" CHAR(1) NOT NULL); + +CREATE TABLE 
"APP"."SEQUENCE_TABLE" ("SEQUENCE_NAME" VARCHAR(256) NOT NULL, "NEXT_VAL" BIGINT NOT NULL); + +CREATE TABLE "APP"."TABLE_PARAMS" ("TBL_ID" BIGINT NOT NULL, "PARAM_KEY" VARCHAR(256) NOT NULL, "PARAM_VALUE" CLOB); + +CREATE TABLE "APP"."BUCKETING_COLS" ("SD_ID" BIGINT NOT NULL, "BUCKET_COL_NAME" VARCHAR(256), "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."SD_PARAMS" ("SD_ID" BIGINT NOT NULL, "PARAM_KEY" VARCHAR(256) NOT NULL, "PARAM_VALUE" CLOB); + +CREATE TABLE "APP"."SKEWED_STRING_LIST" ("STRING_LIST_ID" BIGINT NOT NULL); + +CREATE TABLE "APP"."SKEWED_STRING_LIST_VALUES" ("STRING_LIST_ID" BIGINT NOT NULL, "STRING_LIST_VALUE" VARCHAR(256), "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."SKEWED_COL_NAMES" ("SD_ID" BIGINT NOT NULL, "SKEWED_COL_NAME" VARCHAR(256), "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."SKEWED_COL_VALUE_LOC_MAP" ("SD_ID" BIGINT NOT NULL, "STRING_LIST_ID_KID" BIGINT NOT NULL, "LOCATION" VARCHAR(4000)); + +CREATE TABLE "APP"."SKEWED_VALUES" ("SD_ID_OID" BIGINT NOT NULL, "STRING_LIST_ID_EID" BIGINT NOT NULL, "INTEGER_IDX" INTEGER NOT NULL); + +CREATE TABLE "APP"."VERSION" ("VER_ID" BIGINT NOT NULL, "SCHEMA_VERSION" VARCHAR(127) NOT NULL, "VERSION_COMMENT" VARCHAR(255)); + +CREATE TABLE "APP"."CTLGS" ( + "CTLG_ID" BIGINT NOT NULL, + "NAME" VARCHAR(256) UNIQUE, + "DESC" VARCHAR(4000), + "LOCATION_URI" VARCHAR(4000) NOT NULL); + +-- ---------------------------------------------- +-- DML Statements +-- ---------------------------------------------- + +INSERT INTO "APP"."SEQUENCE_TABLE" ("SEQUENCE_NAME", "NEXT_VAL") SELECT * FROM (VALUES ('org.apache.hadoop.hive.metastore.model.MNotificationLog', 1)) tmp_table WHERE NOT EXISTS ( SELECT "NEXT_VAL" FROM "APP"."SEQUENCE_TABLE" WHERE "SEQUENCE_NAME" = 'org.apache.hadoop.hive.metastore.model.MNotificationLog'); + +-- ---------------------------------------------- +-- DDL Statements for indexes +-- ---------------------------------------------- + + +CREATE UNIQUE INDEX 
"APP"."ROLEENTITYINDEX" ON "APP"."ROLES" ("ROLE_NAME"); + +CREATE UNIQUE INDEX "APP"."UNIQUE_DATABASE" ON "APP"."DBS" ("NAME", "CTLG_NAME"); + +CREATE UNIQUE INDEX "APP"."USERROLEMAPINDEX" ON "APP"."ROLE_MAP" ("PRINCIPAL_NAME", "ROLE_ID", "GRANTOR", "GRANTOR_TYPE"); + +CREATE UNIQUE INDEX "APP"."GLOBALPRIVILEGEINDEX" ON "APP"."GLOBAL_PRIVS" ("AUTHORIZER", "PRINCIPAL_NAME", "PRINCIPAL_TYPE", "USER_PRIV", "GRANTOR", "GRANTOR_TYPE"); + +CREATE UNIQUE INDEX "APP"."UNIQUE_CATALOG" ON "APP"."CTLGS" ("NAME"); + + +-- ---------------------------------------------- +-- DDL Statements for keys +-- ---------------------------------------------- + +-- primary/unique +ALTER TABLE "APP"."CDS" ADD CONSTRAINT "SQL110922153006460" PRIMARY KEY ("CD_ID"); + +ALTER TABLE "APP"."PARTITION_KEYS" ADD CONSTRAINT "PARTITION_KEY_PK" PRIMARY KEY ("TBL_ID", "PKEY_NAME"); + +ALTER TABLE "APP"."SEQUENCE_TABLE" ADD CONSTRAINT "SEQUENCE_TABLE_PK" PRIMARY KEY ("SEQUENCE_NAME"); + +ALTER TABLE "APP"."SDS" ADD CONSTRAINT "SDS_PK" PRIMARY KEY ("SD_ID"); + +ALTER TABLE "APP"."SERDES" ADD CONSTRAINT "SERDES_PK" PRIMARY KEY ("SERDE_ID"); + +ALTER TABLE "APP"."ROLES" ADD CONSTRAINT "ROLES_PK" PRIMARY KEY ("ROLE_ID"); + +ALTER TABLE "APP"."SERDE_PARAMS" ADD CONSTRAINT "SERDE_PARAMS_PK" PRIMARY KEY ("SERDE_ID", "PARAM_KEY"); + +ALTER TABLE "APP"."TBLS" ADD CONSTRAINT "TBLS_PK" PRIMARY KEY ("TBL_ID"); + +ALTER TABLE "APP"."SD_PARAMS" ADD CONSTRAINT "SD_PARAMS_PK" PRIMARY KEY ("SD_ID", "PARAM_KEY"); + +ALTER TABLE "APP"."DATABASE_PARAMS" ADD CONSTRAINT "DATABASE_PARAMS_PK" PRIMARY KEY ("DB_ID", "PARAM_KEY"); + +ALTER TABLE "APP"."DBS" ADD CONSTRAINT "DBS_PK" PRIMARY KEY ("DB_ID"); + +ALTER TABLE "APP"."ROLE_MAP" ADD CONSTRAINT "ROLE_MAP_PK" PRIMARY KEY ("ROLE_GRANT_ID"); + +ALTER TABLE "APP"."GLOBAL_PRIVS" ADD CONSTRAINT "GLOBAL_PRIVS_PK" PRIMARY KEY ("USER_GRANT_ID"); + +ALTER TABLE "APP"."BUCKETING_COLS" ADD CONSTRAINT "BUCKETING_COLS_PK" PRIMARY KEY ("SD_ID", "INTEGER_IDX"); + +ALTER TABLE 
"APP"."SORT_COLS" ADD CONSTRAINT "SORT_COLS_PK" PRIMARY KEY ("SD_ID", "INTEGER_IDX"); + +ALTER TABLE "APP"."COLUMNS_V2" ADD CONSTRAINT "SQL110922153006740" PRIMARY KEY ("CD_ID", "COLUMN_NAME"); + +ALTER TABLE "APP"."TABLE_PARAMS" ADD CONSTRAINT "TABLE_PARAMS_PK" PRIMARY KEY ("TBL_ID", "PARAM_KEY"); + +ALTER TABLE "APP"."SKEWED_STRING_LIST" ADD CONSTRAINT "SKEWED_STRING_LIST_PK" PRIMARY KEY ("STRING_LIST_ID"); + +ALTER TABLE "APP"."SKEWED_STRING_LIST_VALUES" ADD CONSTRAINT "SKEWED_STRING_LIST_VALUES_PK" PRIMARY KEY ("STRING_LIST_ID", "INTEGER_IDX"); + +ALTER TABLE "APP"."SKEWED_COL_NAMES" ADD CONSTRAINT "SKEWED_COL_NAMES_PK" PRIMARY KEY ("SD_ID", "INTEGER_IDX"); + +ALTER TABLE "APP"."SKEWED_COL_VALUE_LOC_MAP" ADD CONSTRAINT "SKEWED_COL_VALUE_LOC_MAP_PK" PRIMARY KEY ("SD_ID", "STRING_LIST_ID_KID"); + +ALTER TABLE "APP"."SKEWED_VALUES" ADD CONSTRAINT "SKEWED_VALUES_PK" PRIMARY KEY ("SD_ID_OID", "INTEGER_IDX"); + +ALTER TABLE "APP"."CTLGS" ADD CONSTRAINT "CTLG_PK" PRIMARY KEY ("CTLG_ID"); + +-- foreign + +ALTER TABLE "APP"."PARTITION_KEYS" ADD CONSTRAINT "PARTITION_KEYS_FK1" FOREIGN KEY ("TBL_ID") REFERENCES "APP"."TBLS" ("TBL_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SDS" ADD CONSTRAINT "SDS_FK1" FOREIGN KEY ("SERDE_ID") REFERENCES "APP"."SERDES" ("SERDE_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SDS" ADD CONSTRAINT "SDS_FK2" FOREIGN KEY ("CD_ID") REFERENCES "APP"."CDS" ("CD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SERDE_PARAMS" ADD CONSTRAINT "SERDE_PARAMS_FK1" FOREIGN KEY ("SERDE_ID") REFERENCES "APP"."SERDES" ("SERDE_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."TBLS" ADD CONSTRAINT "TBLS_FK2" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."TBLS" ADD CONSTRAINT "TBLS_FK1" FOREIGN KEY ("DB_ID") REFERENCES "APP"."DBS" ("DB_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."DBS" ADD 
CONSTRAINT "DBS_FK1" FOREIGN KEY ("CTLG_NAME") REFERENCES "APP"."CTLGS" ("NAME") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SD_PARAMS" ADD CONSTRAINT "SD_PARAMS_FK1" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."DATABASE_PARAMS" ADD CONSTRAINT "DATABASE_PARAMS_FK1" FOREIGN KEY ("DB_ID") REFERENCES "APP"."DBS" ("DB_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."ROLE_MAP" ADD CONSTRAINT "ROLE_MAP_FK1" FOREIGN KEY ("ROLE_ID") REFERENCES "APP"."ROLES" ("ROLE_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."BUCKETING_COLS" ADD CONSTRAINT "BUCKETING_COLS_FK1" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SORT_COLS" ADD CONSTRAINT "SORT_COLS_FK1" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."COLUMNS_V2" ADD CONSTRAINT "COLUMNS_V2_FK1" FOREIGN KEY ("CD_ID") REFERENCES "APP"."CDS" ("CD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."TABLE_PARAMS" ADD CONSTRAINT "TABLE_PARAMS_FK1" FOREIGN KEY ("TBL_ID") REFERENCES "APP"."TBLS" ("TBL_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_STRING_LIST_VALUES" ADD CONSTRAINT "SKEWED_STRING_LIST_VALUES_FK1" FOREIGN KEY ("STRING_LIST_ID") REFERENCES "APP"."SKEWED_STRING_LIST" ("STRING_LIST_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_COL_NAMES" ADD CONSTRAINT "SKEWED_COL_NAMES_FK1" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_COL_VALUE_LOC_MAP" ADD CONSTRAINT "SKEWED_COL_VALUE_LOC_MAP_FK1" FOREIGN KEY ("SD_ID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_COL_VALUE_LOC_MAP" ADD CONSTRAINT "SKEWED_COL_VALUE_LOC_MAP_FK2" FOREIGN KEY ("STRING_LIST_ID_KID") 
REFERENCES "APP"."SKEWED_STRING_LIST" ("STRING_LIST_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_VALUES" ADD CONSTRAINT "SKEWED_VALUES_FK1" FOREIGN KEY ("SD_ID_OID") REFERENCES "APP"."SDS" ("SD_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."SKEWED_VALUES" ADD CONSTRAINT "SKEWED_VALUES_FK2" FOREIGN KEY ("STRING_LIST_ID_EID") REFERENCES "APP"."SKEWED_STRING_LIST" ("STRING_LIST_ID") ON DELETE NO ACTION ON UPDATE NO ACTION; + +ALTER TABLE "APP"."VERSION" ADD CONSTRAINT "VERSION_PK" PRIMARY KEY ("VER_ID"); + +ALTER TABLE "APP"."DBS" ADD CONSTRAINT "DBS_CTLG_FK" FOREIGN KEY ("CTLG_NAME") REFERENCES "APP"."CTLGS" ("NAME") ON DELETE NO ACTION ON UPDATE NO ACTION; + +-- ---------------------------------------------- +-- DDL Statements for checks +-- ---------------------------------------------- + +ALTER TABLE "APP"."SDS" ADD CONSTRAINT "SQL110318025505550" CHECK (IS_COMPRESSED IN ('Y','N')); + +-- ---------------------------- +-- Transaction and Lock Tables +-- ---------------------------- +CREATE TABLE HIVE_LOCKS ( + HL_LOCK_EXT_ID bigint NOT NULL, + HL_LOCK_INT_ID bigint NOT NULL, + HL_TXNID bigint NOT NULL, + HL_DB varchar(128) NOT NULL, + HL_TABLE varchar(128), + HL_PARTITION varchar(767), + HL_LOCK_STATE char(1) NOT NULL, + HL_LOCK_TYPE char(1) NOT NULL, + HL_LAST_HEARTBEAT bigint NOT NULL, + HL_ACQUIRED_AT bigint, + HL_USER varchar(128) NOT NULL, + HL_HOST varchar(128) NOT NULL, + HL_HEARTBEAT_COUNT integer, + HL_AGENT_INFO varchar(128), + HL_BLOCKEDBY_EXT_ID bigint, + HL_BLOCKEDBY_INT_ID bigint, + PRIMARY KEY(HL_LOCK_EXT_ID, HL_LOCK_INT_ID) +); + +CREATE INDEX HL_TXNID_INDEX ON HIVE_LOCKS (HL_TXNID); + +CREATE TABLE NEXT_LOCK_ID ( + NL_NEXT bigint NOT NULL +); +INSERT INTO NEXT_LOCK_ID VALUES(1); + +CREATE TABLE AUX_TABLE ( + MT_KEY1 varchar(128) NOT NULL, + MT_KEY2 bigint NOT NULL, + MT_COMMENT varchar(255), + PRIMARY KEY(MT_KEY1, MT_KEY2) +); + +--1st 4 cols make up a PK but since WS_PARTITION is nullable we can't 
declare such PK +--This is a good candidate for Index orgainzed table + +-- ----------------------------------------------------------------- +-- Record schema version. Should be the last step in the init script +-- ----------------------------------------------------------------- +INSERT INTO "APP"."VERSION" (VER_ID, SCHEMA_VERSION, VERSION_COMMENT) VALUES (1, '3.1.0', 'Hive release version 3.1.0'); \ No newline at end of file diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergCatalogConfig.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergCatalogConfig.java index 2956d75a266e7..5307047354b8b 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergCatalogConfig.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergCatalogConfig.java @@ -19,19 +19,27 @@ import com.google.auto.value.AutoValue; import java.io.Serializable; -import java.util.Properties; +import java.util.Map; +import org.apache.beam.sdk.util.ReleaseInfo; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.CatalogUtil; +import org.checkerframework.checker.nullness.qual.Nullable; import org.checkerframework.dataflow.qual.Pure; @AutoValue public abstract class IcebergCatalogConfig implements Serializable { @Pure + @Nullable public abstract String getCatalogName(); @Pure - public abstract Properties getProperties(); + @Nullable + public abstract Map getCatalogProperties(); + + @Pure + @Nullable + public abstract Map getConfigProperties(); @Pure public static Builder builder() { @@ -39,15 +47,32 @@ public static Builder builder() { } public org.apache.iceberg.catalog.Catalog catalog() { - return CatalogUtil.buildIcebergCatalog( - getCatalogName(), Maps.fromProperties(getProperties()), new Configuration()); + String catalogName = getCatalogName(); + if (catalogName == null) { + 
catalogName = "apache-beam-" + ReleaseInfo.getReleaseInfo().getVersion(); + } + Map catalogProps = getCatalogProperties(); + if (catalogProps == null) { + catalogProps = Maps.newHashMap(); + } + Map confProps = getConfigProperties(); + if (confProps == null) { + confProps = Maps.newHashMap(); + } + Configuration config = new Configuration(); + for (Map.Entry prop : confProps.entrySet()) { + config.set(prop.getKey(), prop.getValue()); + } + return CatalogUtil.buildIcebergCatalog(catalogName, catalogProps, config); } @AutoValue.Builder public abstract static class Builder { - public abstract Builder setCatalogName(String catalogName); + public abstract Builder setCatalogName(@Nullable String catalogName); + + public abstract Builder setCatalogProperties(@Nullable Map props); - public abstract Builder setProperties(Properties props); + public abstract Builder setConfigProperties(@Nullable Map props); public abstract IcebergCatalogConfig build(); } diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java index ef535353efd01..df7bda4560dd5 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProvider.java @@ -18,18 +18,11 @@ package org.apache.beam.sdk.io.iceberg; import com.google.auto.service.AutoService; -import com.google.auto.value.AutoValue; import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.Properties; -import org.apache.beam.sdk.io.iceberg.IcebergReadSchemaTransformProvider.Config; import org.apache.beam.sdk.managed.ManagedTransformConstants; -import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import 
org.apache.beam.sdk.schemas.SchemaRegistry; -import org.apache.beam.sdk.schemas.annotations.DefaultSchema; -import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -44,11 +37,12 @@ * org.apache.beam.sdk.values.Row}s. */ @AutoService(SchemaTransformProvider.class) -public class IcebergReadSchemaTransformProvider extends TypedSchemaTransformProvider { +public class IcebergReadSchemaTransformProvider + extends TypedSchemaTransformProvider { static final String OUTPUT_TAG = "output"; @Override - protected SchemaTransform from(Config configuration) { + protected SchemaTransform from(SchemaTransformConfiguration configuration) { return new IcebergReadSchemaTransform(configuration); } @@ -62,38 +56,10 @@ public String identifier() { return ManagedTransformConstants.ICEBERG_READ; } - @DefaultSchema(AutoValueSchema.class) - @AutoValue - public abstract static class Config { - public static Builder builder() { - return new AutoValue_IcebergReadSchemaTransformProvider_Config.Builder(); - } - - @SchemaFieldDescription("Identifier of the Iceberg table to write to.") - public abstract String getTable(); - - @SchemaFieldDescription("Name of the catalog containing the table.") - public abstract String getCatalogName(); - - @SchemaFieldDescription("Configuration properties used to set up the Iceberg catalog.") - public abstract Map getCatalogProperties(); - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setTable(String table); - - public abstract Builder setCatalogName(String catalogName); - - public abstract Builder setCatalogProperties(Map catalogProperties); - - public abstract Config build(); - } - } - static class IcebergReadSchemaTransform extends SchemaTransform { - private final Config configuration; + private 
final SchemaTransformConfiguration configuration; - IcebergReadSchemaTransform(Config configuration) { + IcebergReadSchemaTransform(SchemaTransformConfiguration configuration) { this.configuration = configuration; } @@ -102,7 +68,7 @@ Row getConfigurationRow() { // To stay consistent with our SchemaTransform configuration naming conventions, // we sort lexicographically and convert field names to snake_case return SchemaRegistry.createDefault() - .getToRowFunction(Config.class) + .getToRowFunction(SchemaTransformConfiguration.class) .apply(configuration) .sorted() .toSnakeCase(); @@ -113,19 +79,11 @@ Row getConfigurationRow() { @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { - Properties properties = new Properties(); - properties.putAll(configuration.getCatalogProperties()); - - IcebergCatalogConfig.Builder catalogBuilder = - IcebergCatalogConfig.builder() - .setCatalogName(configuration.getCatalogName()) - .setProperties(properties); - PCollection output = input .getPipeline() .apply( - IcebergIO.readRows(catalogBuilder.build()) + IcebergIO.readRows(configuration.getIcebergCatalog()) .from(TableIdentifier.parse(configuration.getTable()))); return PCollectionRowTuple.of(OUTPUT_TAG, output); diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java index b3de7a88c541d..3f0f88946d9ca 100644 --- a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProvider.java @@ -18,19 +18,12 @@ package org.apache.beam.sdk.io.iceberg; import com.google.auto.service.AutoService; -import com.google.auto.value.AutoValue; import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.Properties; -import 
org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.Config; import org.apache.beam.sdk.managed.ManagedTransformConstants; -import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaRegistry; -import org.apache.beam.sdk.schemas.annotations.DefaultSchema; -import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -48,7 +41,8 @@ * outputs a {@code PCollection} representing snapshots created in the process. */ @AutoService(SchemaTransformProvider.class) -public class IcebergWriteSchemaTransformProvider extends TypedSchemaTransformProvider { +public class IcebergWriteSchemaTransformProvider + extends TypedSchemaTransformProvider { static final String INPUT_TAG = "input"; static final String OUTPUT_TAG = "output"; @@ -64,7 +58,7 @@ public String description() { } @Override - protected SchemaTransform from(Config configuration) { + protected SchemaTransform from(SchemaTransformConfiguration configuration) { return new IcebergWriteSchemaTransform(configuration); } @@ -83,38 +77,10 @@ public String identifier() { return ManagedTransformConstants.ICEBERG_WRITE; } - @DefaultSchema(AutoValueSchema.class) - @AutoValue - public abstract static class Config { - public static Builder builder() { - return new AutoValue_IcebergWriteSchemaTransformProvider_Config.Builder(); - } - - @SchemaFieldDescription("Identifier of the Iceberg table to write to.") - public abstract String getTable(); - - @SchemaFieldDescription("Name of the catalog containing the table.") - public abstract String getCatalogName(); - - @SchemaFieldDescription("Configuration properties used to set up the Iceberg catalog.") - public abstract Map 
getCatalogProperties(); - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setTable(String table); - - public abstract Builder setCatalogName(String catalogName); - - public abstract Builder setCatalogProperties(Map catalogProperties); - - public abstract Config build(); - } - } - static class IcebergWriteSchemaTransform extends SchemaTransform { - private final Config configuration; + private final SchemaTransformConfiguration configuration; - IcebergWriteSchemaTransform(Config configuration) { + IcebergWriteSchemaTransform(SchemaTransformConfiguration configuration) { this.configuration = configuration; } @@ -123,7 +89,7 @@ Row getConfigurationRow() { // To stay consistent with our SchemaTransform configuration naming conventions, // we sort lexicographically and convert field names to snake_case return SchemaRegistry.createDefault() - .getToRowFunction(Config.class) + .getToRowFunction(SchemaTransformConfiguration.class) .apply(configuration) .sorted() .toSnakeCase(); @@ -136,19 +102,11 @@ Row getConfigurationRow() { public PCollectionRowTuple expand(PCollectionRowTuple input) { PCollection rows = input.get(INPUT_TAG); - Properties properties = new Properties(); - properties.putAll(configuration.getCatalogProperties()); - - IcebergCatalogConfig catalog = - IcebergCatalogConfig.builder() - .setCatalogName(configuration.getCatalogName()) - .setProperties(properties) - .build(); - // TODO: support dynamic destinations IcebergWriteResult result = rows.apply( - IcebergIO.writeRows(catalog).to(TableIdentifier.parse(configuration.getTable()))); + IcebergIO.writeRows(configuration.getIcebergCatalog()) + .to(TableIdentifier.parse(configuration.getTable()))); PCollection snapshots = result diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaTransformConfiguration.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaTransformConfiguration.java new file mode 100644 index 
0000000000000..6e7a12aa15afc --- /dev/null +++ b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/SchemaTransformConfiguration.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.iceberg; + +import com.google.auto.value.AutoValue; +import java.util.Map; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.checkerframework.checker.nullness.qual.Nullable; + +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class SchemaTransformConfiguration { + public static Builder builder() { + return new AutoValue_SchemaTransformConfiguration.Builder(); + } + + @SchemaFieldDescription("Identifier of the Iceberg table.") + public abstract String getTable(); + + @SchemaFieldDescription("Name of the catalog containing the table.") + @Nullable + public abstract String getCatalogName(); + + @SchemaFieldDescription("Properties used to set up the Iceberg catalog.") + @Nullable + public abstract Map getCatalogProperties(); + + @SchemaFieldDescription("Properties passed to the Hadoop Configuration.") + 
@Nullable + public abstract Map getConfigProperties(); + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setTable(String table); + + public abstract Builder setCatalogName(String catalogName); + + public abstract Builder setCatalogProperties(Map catalogProperties); + + public abstract Builder setConfigProperties(Map confProperties); + + public abstract SchemaTransformConfiguration build(); + } + + public IcebergCatalogConfig getIcebergCatalog() { + return IcebergCatalogConfig.builder() + .setCatalogName(getCatalogName()) + .setCatalogProperties(getCatalogProperties()) + .setConfigProperties(getConfigProperties()) + .build(); + } +} diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java index 3f31073b44486..fe4a07dedfdf9 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOReadTest.java @@ -21,7 +21,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder; import java.util.List; -import java.util.Properties; +import java.util.Map; import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -33,6 +33,7 @@ import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; @@ -94,12 +95,17 @@ public void testSimpleScan() throws Exception { .map(record -> IcebergUtils.icebergRecordToBeamRow(schema, record)) .collect(Collectors.toList()); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - 
props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); IcebergCatalogConfig catalogConfig = - IcebergCatalogConfig.builder().setCatalogName("name").setProperties(props).build(); + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); PCollection output = testPipeline diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java index 02213c45e0756..2abe6b0934819 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergIOWriteTest.java @@ -23,7 +23,6 @@ import java.io.Serializable; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.UUID; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.TestPipeline; @@ -76,12 +75,17 @@ public void testSimpleAppend() throws Exception { // Create a table and add records to it. 
Table table = warehouse.createTable(tableId, TestFixtures.SCHEMA); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); IcebergCatalogConfig catalog = - IcebergCatalogConfig.builder().setCatalogName("name").setProperties(props).build(); + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); testPipeline .apply("Records To Add", Create.of(TestFixtures.asRows(TestFixtures.FILE1SNAPSHOT1))) @@ -110,12 +114,17 @@ public void testDynamicDestinationsWithoutSpillover() throws Exception { Table table2 = warehouse.createTable(table2Id, TestFixtures.SCHEMA); Table table3 = warehouse.createTable(table3Id, TestFixtures.SCHEMA); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); IcebergCatalogConfig catalog = - IcebergCatalogConfig.builder().setCatalogName("name").setProperties(props).build(); + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); DynamicDestinations dynamicDestinations = new DynamicDestinations() { @@ -200,12 +209,17 @@ public void testDynamicDestinationsWithSpillover() throws Exception { elementsPerTable.computeIfAbsent(tableId, ignored -> Lists.newArrayList()).add(element); } - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", 
CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); IcebergCatalogConfig catalog = - IcebergCatalogConfig.builder().setCatalogName("name").setProperties(props).build(); + IcebergCatalogConfig.builder() + .setCatalogName("name") + .setCatalogProperties(catalogProps) + .build(); DynamicDestinations dynamicDestinations = new DynamicDestinations() { diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java index effb5cc4838e9..0311c31da4058 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergReadSchemaTransformProviderTest.java @@ -101,8 +101,8 @@ public void testSimpleScan() throws Exception { properties.put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); properties.put("warehouse", warehouse.location); - IcebergReadSchemaTransformProvider.Config readConfig = - IcebergReadSchemaTransformProvider.Config.builder() + SchemaTransformConfiguration readConfig = + SchemaTransformConfiguration.builder() .setTable(identifier) .setCatalogName("name") .setCatalogProperties(properties) diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergSchemaTransformTranslationTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergSchemaTransformTranslationTest.java index 7863f7812a13f..86a5e0bcd432f 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergSchemaTransformTranslationTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergSchemaTransformTranslationTest.java @@ -71,6 +71,8 @@ public class IcebergSchemaTransformTranslationTest { .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) 
.put("warehouse", "test_location") .build(); + private static final Map CONFIG_PROPERTIES = + ImmutableMap.builder().put("key", "value").put("key2", "value2").build(); @Test public void testReCreateWriteTransformFromRow() { @@ -79,6 +81,7 @@ public void testReCreateWriteTransformFromRow() { .withFieldValue("table", "test_table_identifier") .withFieldValue("catalog_name", "test-name") .withFieldValue("catalog_properties", CATALOG_PROPERTIES) + .withFieldValue("config_properties", CONFIG_PROPERTIES) .build(); IcebergWriteSchemaTransform writeTransform = (IcebergWriteSchemaTransform) WRITE_PROVIDER.from(transformConfigRow); @@ -110,6 +113,7 @@ public void testWriteTransformProtoTranslation() .withFieldValue("table", "test_identifier") .withFieldValue("catalog_name", "test-name") .withFieldValue("catalog_properties", CATALOG_PROPERTIES) + .withFieldValue("config_properties", CONFIG_PROPERTIES) .build(); IcebergWriteSchemaTransform writeTransform = @@ -161,6 +165,7 @@ public void testReCreateReadTransformFromRow() { .withFieldValue("table", "test_table_identifier") .withFieldValue("catalog_name", "test-name") .withFieldValue("catalog_properties", CATALOG_PROPERTIES) + .withFieldValue("config_properties", CONFIG_PROPERTIES) .build(); IcebergReadSchemaTransform readTransform = @@ -192,6 +197,7 @@ public void testReadTransformProtoTranslation() .withFieldValue("table", identifier) .withFieldValue("catalog_name", "test-name") .withFieldValue("catalog_properties", properties) + .withFieldValue("config_properties", CONFIG_PROPERTIES) .build(); IcebergReadSchemaTransform readTransform = diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java index a2cd64e239567..6b555e7e14d0f 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java 
+++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/IcebergWriteSchemaTransformProviderTest.java @@ -17,7 +17,6 @@ */ package org.apache.beam.sdk.io.iceberg; -import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.Config; import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.INPUT_TAG; import static org.apache.beam.sdk.io.iceberg.IcebergWriteSchemaTransformProvider.OUTPUT_TAG; import static org.hamcrest.MatcherAssert.assertThat; @@ -89,8 +88,8 @@ public void testSimpleAppend() { properties.put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); properties.put("warehouse", warehouse.location); - Config config = - Config.builder() + SchemaTransformConfiguration config = + SchemaTransformConfiguration.builder() .setTable(identifier) .setCatalogName("name") .setCatalogProperties(properties) diff --git a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java index 007cb028c665d..38a15cb2aa98f 100644 --- a/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java +++ b/sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/ScanSourceTest.java @@ -20,13 +20,14 @@ import static org.hamcrest.MatcherAssert.assertThat; import java.util.List; -import java.util.Properties; +import java.util.Map; import java.util.UUID; import org.apache.beam.sdk.io.BoundedSource; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.SourceTestUtils; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.iceberg.CatalogUtil; import org.apache.iceberg.Table; import org.apache.iceberg.catalog.TableIdentifier; @@ -65,9 +66,11 @@ public void testUnstartedReaderReadsSamesItsSource() throws 
Exception { PipelineOptions options = PipelineOptionsFactory.create(); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); BoundedSource source = new ScanSource( @@ -75,7 +78,7 @@ public void testUnstartedReaderReadsSamesItsSource() throws Exception { .setCatalogConfig( IcebergCatalogConfig.builder() .setCatalogName("name") - .setProperties(props) + .setCatalogProperties(catalogProps) .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) @@ -107,9 +110,11 @@ public void testInitialSplitting() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); BoundedSource source = new ScanSource( @@ -117,7 +122,7 @@ public void testInitialSplitting() throws Exception { .setCatalogConfig( IcebergCatalogConfig.builder() .setCatalogName("name") - .setProperties(props) + .setCatalogProperties(catalogProps) .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) @@ -153,9 +158,11 @@ public void testDoubleInitialSplitting() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); - Properties props = new Properties(); - props.setProperty("type", CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP); - props.setProperty("warehouse", warehouse.location); + Map catalogProps = + ImmutableMap.builder() + .put("type", 
CatalogUtil.ICEBERG_CATALOG_TYPE_HADOOP) + .put("warehouse", warehouse.location) + .build(); BoundedSource source = new ScanSource( @@ -163,7 +170,7 @@ public void testDoubleInitialSplitting() throws Exception { .setCatalogConfig( IcebergCatalogConfig.builder() .setCatalogName("name") - .setProperties(props) + .setCatalogProperties(catalogProps) .build()) .setScanType(IcebergScanConfig.ScanType.TABLE) .setTableIdentifier(simpleTable.name().replace("hadoop.", "").split("\\.")) diff --git a/settings.gradle.kts b/settings.gradle.kts index 4d4b93908a02f..65a55885afa72 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -365,3 +365,7 @@ include("sdks:java:io:solace") findProject(":sdks:java:io:solace")?.name = "solace" include("sdks:java:extensions:combiners") findProject(":sdks:java:extensions:combiners")?.name = "combiners" +include("sdks:java:io:iceberg:hive") +findProject(":sdks:java:io:iceberg:hive")?.name = "hive" +include("sdks:java:io:iceberg:hive:exec") +findProject(":sdks:java:io:iceberg:hive:exec")?.name = "exec" From fc5a71db5caa95fd14988bfe475c240873216a2c Mon Sep 17 00:00:00 2001 From: Francis O'Hara Date: Fri, 9 Aug 2024 23:12:22 +0000 Subject: [PATCH 58/78] [CsvIO]: Implement CsvIOParse::withCustomRecordParsing method (#32142) * completed implementation without tests Co-authored-by: Lahari Guduru * intermediate stage Co-authored-by: Lahari Guduru * Implement CsvIOParse.withCustomRecordParsing Co-authored-by: Lahari Guduru --------- Co-authored-by: Lahari Guduru --- .../apache/beam/sdk/io/csv/CsvIOParse.java | 28 +++- .../sdk/io/csv/CsvIOParseConfiguration.java | 12 +- .../beam/sdk/io/csv/CsvIOParseTest.java | 127 +++++++++++++++++- 3 files changed, 158 insertions(+), 9 deletions(-) diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java index 0a27cdbc57eca..5981e81327652 100644 --- 
a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParse.java @@ -28,6 +28,7 @@ import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; import org.apache.beam.sdk.values.TupleTag; +import org.checkerframework.checker.nullness.qual.NonNull; /** * {@link PTransform} for Parsing CSV Record Strings into {@link Schema}-mapped target types. {@link @@ -43,9 +44,30 @@ static CsvIOParse.Builder builder() { return new AutoValue_CsvIOParse.Builder<>(); } - // TODO(https://github.com/apache/beam/issues/31875): Implement in future PR. - public CsvIOParse withCustomRecordParsing( - Map> customProcessingMap) { + /** + * Configures custom cell parsing. + * + *

    Example

    + * + *
    {@code
    +   * CsvIO.parse().withCustomRecordParsing("listOfInts", cell-> {
    +   *
    +   *  List result = new ArrayList<>();
    +   *  for (String stringValue: Splitter.on(";").split(cell)) {
    +   *    result.add(Integer.parseInt(stringValue));
    +   *  }
    +   *  return result;
    +   * });
    +   * }
    + */ + public CsvIOParse withCustomRecordParsing( + String fieldName, SerializableFunction customRecordParsingFn) { + + Map> customProcessingMap = + getConfigBuilder().getOrCreateCustomProcessingMap(); + + customProcessingMap.put(fieldName, customRecordParsingFn::apply); + getConfigBuilder().setCustomProcessingMap(customProcessingMap); return this; } diff --git a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java index dd9ef5b348686..2be871a9dc2dc 100644 --- a/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java +++ b/sdks/java/io/csv/src/main/java/org/apache/beam/sdk/io/csv/CsvIOParseConfiguration.java @@ -60,18 +60,26 @@ abstract static class Builder implements Serializable { abstract Builder setCustomProcessingMap( Map> customProcessingMap); + abstract Optional>> getCustomProcessingMap(); + + final Map> getOrCreateCustomProcessingMap() { + if (!getCustomProcessingMap().isPresent()) { + setCustomProcessingMap(new HashMap<>()); + } + return getCustomProcessingMap().get(); + } + abstract Builder setCoder(Coder coder); abstract Builder setFromRowFn(SerializableFunction fromRowFn); - abstract Optional>> getCustomProcessingMap(); - abstract CsvIOParseConfiguration autoBuild(); final CsvIOParseConfiguration build() { if (!getCustomProcessingMap().isPresent()) { setCustomProcessingMap(new HashMap<>()); } + return autoBuild(); } } diff --git a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java index 05d6982004f45..a517cef3d51f5 100644 --- a/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java +++ b/sdks/java/io/csv/src/test/java/org/apache/beam/sdk/io/csv/CsvIOParseTest.java @@ -19,10 +19,17 @@ import static 
org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA; import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TIME_CONTAINING_SCHEMA; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TIME_CONTAINING_TYPE_DESCRIPTOR; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.TimeContaining; import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypes; import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesFromRowFn; import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.nullableAllPrimitiveDataTypesToRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContaining; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContainingFromRowFn; +import static org.apache.beam.sdk.io.common.SchemaAwareJavaBeans.timeContainingToRowFn; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -38,17 +45,22 @@ import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.util.SerializableUtils; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Splitter; import org.apache.commons.csv.CSVFormat; +import org.joda.time.Instant; +import org.joda.time.format.DateTimeFormat; import org.junit.Rule; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +/** Tests for {@link CsvIOParse}. 
*/ @RunWith(JUnit4.class) public class CsvIOParseTest { @@ -61,6 +73,12 @@ public class CsvIOParseTest { NULLABLE_ALL_PRIMITIVE_DATA_TYPES_TYPE_DESCRIPTOR, nullableAllPrimitiveDataTypesToRowFn(), nullableAllPrimitiveDataTypesFromRowFn()); + private static final Coder TIME_CONTAINING_CODER = + SchemaCoder.of( + TIME_CONTAINING_SCHEMA, + TIME_CONTAINING_TYPE_DESCRIPTOR, + timeContainingToRowFn(), + timeContainingFromRowFn()); private static final SerializableFunction ROW_ROW_SERIALIZABLE_FUNCTION = row -> row; @Rule public final TestPipeline pipeline = TestPipeline.create(); @@ -120,7 +138,7 @@ public void parseRows() { underTest( NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, csvFormat(), - emptyCustomProcessingMap(), + new HashMap<>(), ROW_ROW_SERIALIZABLE_FUNCTION, RowCoder.of(NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA))); PAssert.that(result.getOutput()).containsInAnyOrder(want); @@ -152,7 +170,7 @@ public void parsePOJOs() { underTest( NULLABLE_ALL_PRIMITIVE_DATA_TYPES_SCHEMA, csvFormat(), - emptyCustomProcessingMap(), + new HashMap<>(), nullableAllPrimitiveDataTypesFromRowFn(), NULLABLE_ALL_PRIMITIVE_DATA_TYPES_CODER)); PAssert.that(result.getOutput()).containsInAnyOrder(want); @@ -161,6 +179,98 @@ public void parsePOJOs() { pipeline.run(); } + @Test + public void givenSingleCustomParsingLambda_parsesPOJOs() { + PCollection records = + csvRecords( + pipeline, + "instant,instantList", + "2024-01-23T10:00:05.000Z,10-00-05-2024-01-23;12-59-59-2024-01-24"); + TimeContaining want = + timeContaining( + Instant.parse("2024-01-23T10:00:05.000Z"), + Arrays.asList( + Instant.parse("2024-01-23T10:00:05.000Z"), + Instant.parse("2024-01-24T12:59:59.000Z"))); + + CsvIOParse underTest = + underTest( + TIME_CONTAINING_SCHEMA, + CSVFormat.DEFAULT + .withHeader("instant", "instantList") + .withAllowDuplicateHeaderNames(false), + new HashMap<>(), + timeContainingFromRowFn(), + TIME_CONTAINING_CODER) + .withCustomRecordParsing("instantList", instantListParsingLambda()); + + 
CsvIOParseResult result = records.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenMultipleCustomParsingLambdas_parsesPOJOs() { + PCollection records = + csvRecords( + pipeline, + "instant,instantList", + "2024-01-23@10:00:05,10-00-05-2024-01-23;12-59-59-2024-01-24"); + TimeContaining want = + timeContaining( + Instant.parse("2024-01-23T10:00:05.000Z"), + Arrays.asList( + Instant.parse("2024-01-23T10:00:05.000Z"), + Instant.parse("2024-01-24T12:59:59.000Z"))); + + CsvIOParse underTest = + underTest( + TIME_CONTAINING_SCHEMA, + CSVFormat.DEFAULT + .withHeader("instant", "instantList") + .withAllowDuplicateHeaderNames(false), + new HashMap<>(), + timeContainingFromRowFn(), + TIME_CONTAINING_CODER) + .withCustomRecordParsing( + "instant", + input -> + DateTimeFormat.forPattern("yyyy-MM-dd@HH:mm:ss") + .parseDateTime(input) + .toInstant()) + .withCustomRecordParsing("instantList", instantListParsingLambda()); + + CsvIOParseResult result = records.apply(underTest); + PAssert.that(result.getOutput()).containsInAnyOrder(want); + PAssert.that(result.getErrors()).empty(); + + pipeline.run(); + } + + @Test + public void givenCustomParsingError_emits() { + PCollection records = + csvRecords(pipeline, "instant,instantList", "2024-01-23T10:00:05.000Z,BAD CELL"); + CsvIOParse underTest = + underTest( + TIME_CONTAINING_SCHEMA, + CSVFormat.DEFAULT + .withHeader("instant", "instantList") + .withAllowDuplicateHeaderNames(false), + new HashMap<>(), + timeContainingFromRowFn(), + TIME_CONTAINING_CODER) + .withCustomRecordParsing("instantList", instantListParsingLambda()); + + CsvIOParseResult result = records.apply(underTest); + PAssert.that(result.getOutput()).empty(); + PAssert.thatSingleton(result.getErrors().apply(Count.globally())).isEqualTo(1L); + + pipeline.run(); + } + private static CSVFormat csvFormat() { return CSVFormat.DEFAULT 
.withAllowDuplicateHeaderNames(false) @@ -191,7 +301,16 @@ private static CsvIOParse underTest( return CsvIOParse.builder().setConfigBuilder(configBuilder).build(); } - private static Map> emptyCustomProcessingMap() { - return new HashMap<>(); + private static SerializableFunction> instantListParsingLambda() { + return input -> { + Iterable cells = Splitter.on(';').split(input); + ; + List output = new ArrayList<>(); + for (String cell : cells) { + output.add( + DateTimeFormat.forPattern("HH-mm-ss-yyyy-MM-dd").parseDateTime(cell).toInstant()); + } + return output; + }; } } From 488996913ff9b2edf83f855e7d0050075ac1b39d Mon Sep 17 00:00:00 2001 From: jonathan-lemos Date: Fri, 9 Aug 2024 21:21:44 -0400 Subject: [PATCH 59/78] Add support for setting an HTTP read timeout for BigQueryIO (#32118) Shamelessly stolen from https://github.com/apache/beam/pull/7097 --- .../apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java | 9 ++++++++- .../beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java index ba76f483f774b..faa252e79b2d6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java @@ -39,7 +39,14 @@ public interface BigQueryOptions void setTempDatasetId(String value); @Description( - "Timeout for HTTP requests to BigQuery service in milliseconds. Set to 0 to disable.") + "Timeout for HTTP read requests to BigQuery service in milliseconds. 
Set to 0 to disable.") + @Default.Integer(80 * 1000) + Integer getHTTPReadTimeout(); + + void setHTTPReadTimeout(Integer timeout); + + @Description( + "Timeout for HTTP write requests to BigQuery service in milliseconds. Set to 0 to disable.") @Default.Integer(900 * 1000) Integer getHTTPWriteTimeout(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java index b87b6a222a4d7..115875c59411c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java @@ -1584,6 +1584,7 @@ private static Bigquery.Builder newBigQueryClient(BigQueryOptions options) { RetryHttpRequestInitializer httpRequestInitializer = new RetryHttpRequestInitializer(ImmutableList.of(404)); httpRequestInitializer.setCustomErrors(createBigQueryClientCustomErrors()); + httpRequestInitializer.setReadTimeout(options.getHTTPReadTimeout()); httpRequestInitializer.setWriteTimeout(options.getHTTPWriteTimeout()); ImmutableList.Builder initBuilder = ImmutableList.builder(); Credentials credential = options.getGcpCredential(); From 2f93d8bc19917f83d15f531bcbbfb7f36e21ff88 Mon Sep 17 00:00:00 2001 From: Hyeonho Kim Date: Sun, 11 Aug 2024 04:13:52 +0900 Subject: [PATCH 60/78] fix: cover bigquery datetime fraction 1 to 6 or absent (#32146) --- .../sdk/io/gcp/bigquery/BigQueryUtils.java | 4 +- .../io/gcp/bigquery/BigQueryUtilsTest.java | 103 +++++++++++++++++- 2 files changed, 100 insertions(+), 7 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java index 
6b60b138b4fda..305abad5783aa 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtils.java @@ -175,7 +175,7 @@ public abstract static class Builder { /** * Native BigQuery formatter for it's timestamp format, depending on the milliseconds stored in - * the column, the milli second part will be 6, 3 or absent. Example {@code 2019-08-16 + * the column, the milli second part will be 6 to 1 or absent. Example {@code 2019-08-16 * 00:52:07[.123]|[.123456] UTC} */ private static final DateTimeFormatter BIGQUERY_TIMESTAMP_PARSER; @@ -202,7 +202,7 @@ public abstract static class Builder { .appendOptional( new DateTimeFormatterBuilder() .appendLiteral('.') - .appendFractionOfSecond(3, 6) + .appendFractionOfSecond(1, 6) .toParser()) .appendLiteral(" UTC") .toFormatter() diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java index 8a3ad16e190d4..e13e4a92a4dc4 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryUtilsTest.java @@ -77,6 +77,10 @@ public class BigQueryUtilsTest { .addNullableField("timestamp_variant2", Schema.FieldType.DATETIME) .addNullableField("timestamp_variant3", Schema.FieldType.DATETIME) .addNullableField("timestamp_variant4", Schema.FieldType.DATETIME) + .addNullableField("timestamp_variant5", Schema.FieldType.DATETIME) + .addNullableField("timestamp_variant6", Schema.FieldType.DATETIME) + .addNullableField("timestamp_variant7", Schema.FieldType.DATETIME) + .addNullableField("timestamp_variant8", Schema.FieldType.DATETIME) .addNullableField("datetime", 
Schema.FieldType.logicalType(SqlTypes.DATETIME)) .addNullableField("datetime0ms", Schema.FieldType.logicalType(SqlTypes.DATETIME)) .addNullableField("datetime0s_ns", Schema.FieldType.logicalType(SqlTypes.DATETIME)) @@ -150,6 +154,22 @@ public class BigQueryUtilsTest { new TableFieldSchema() .setName("timestamp_variant4") .setType(StandardSQLTypeName.TIMESTAMP.toString()); + private static final TableFieldSchema TIMESTAMP_VARIANT5 = + new TableFieldSchema() + .setName("timestamp_variant5") + .setType(StandardSQLTypeName.TIMESTAMP.toString()); + private static final TableFieldSchema TIMESTAMP_VARIANT6 = + new TableFieldSchema() + .setName("timestamp_variant6") + .setType(StandardSQLTypeName.TIMESTAMP.toString()); + private static final TableFieldSchema TIMESTAMP_VARIANT7 = + new TableFieldSchema() + .setName("timestamp_variant7") + .setType(StandardSQLTypeName.TIMESTAMP.toString()); + private static final TableFieldSchema TIMESTAMP_VARIANT8 = + new TableFieldSchema() + .setName("timestamp_variant8") + .setType(StandardSQLTypeName.TIMESTAMP.toString()); private static final TableFieldSchema DATETIME = new TableFieldSchema().setName("datetime").setType(StandardSQLTypeName.DATETIME.toString()); @@ -240,6 +260,10 @@ public class BigQueryUtilsTest { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8, DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -271,6 +295,10 @@ public class BigQueryUtilsTest { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8, DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -312,6 +340,18 @@ public class BigQueryUtilsTest { .withZoneUTC() .parseDateTime("2019-08-18T15:52:07.123"), new DateTime(123456), + ISODateTimeFormat.dateHourMinuteSecondFraction() + .withZoneUTC() + .parseDateTime("2024-08-10T16:52:07.1"), + ISODateTimeFormat.dateHourMinuteSecondFraction() + 
.withZoneUTC() + .parseDateTime("2024-08-10T16:52:07.12"), + ISODateTimeFormat.dateHourMinuteSecondFraction() + .withZoneUTC() + .parseDateTime("2024-08-10T16:52:07.1234"), + ISODateTimeFormat.dateHourMinuteSecondFraction() + .withZoneUTC() + .parseDateTime("2024-08-10T16:52:07.12345"), LocalDateTime.parse("2020-11-02T12:34:56.789876"), LocalDateTime.parse("2020-11-02T12:34:56"), LocalDateTime.parse("2020-11-02T12:34:00.789876"), @@ -343,6 +383,11 @@ public class BigQueryUtilsTest { "timestamp_variant4", String.valueOf( new DateTime(123456L, ISOChronology.getInstanceUTC()).getMillis() / 1000.0D)) + .set("timestamp_variant5", "2024-08-10 16:52:07.1 UTC") + .set("timestamp_variant6", "2024-08-10 16:52:07.12 UTC") + // we'll lose precision, but it's something BigQuery can output! + .set("timestamp_variant7", "2024-08-10 16:52:07.1234 UTC") + .set("timestamp_variant8", "2024-08-10 16:52:07.12345 UTC") .set("datetime", "2020-11-02T12:34:56.789876") .set("datetime0ms", "2020-11-02T12:34:56") .set("datetime0s_ns", "2020-11-02T12:34:00.789876") @@ -364,7 +409,7 @@ public class BigQueryUtilsTest { Row.withSchema(FLAT_TYPE) .addValues( null, null, null, null, null, null, null, null, null, null, null, null, null, null, - null, null, null, null, null, null, null, null, null) + null, null, null, null, null, null, null, null, null, null, null, null, null) .build(); private static final TableRow BQ_NULL_FLAT_ROW = @@ -376,6 +421,10 @@ public class BigQueryUtilsTest { .set("timestamp_variant2", null) .set("timestamp_variant3", null) .set("timestamp_variant4", null) + .set("timestamp_variant5", null) + .set("timestamp_variant6", null) + .set("timestamp_variant7", null) + .set("timestamp_variant8", null) .set("datetime", null) .set("datetime0ms", null) .set("datetime0s_ns", null) @@ -459,6 +508,10 @@ public class BigQueryUtilsTest { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8,
DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -515,6 +568,10 @@ public void testToTableSchema_flat() { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8, DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -566,6 +623,10 @@ public void testToTableSchema_row() { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8, DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -603,6 +664,10 @@ public void testToTableSchema_array_row() { TIMESTAMP_VARIANT2, TIMESTAMP_VARIANT3, TIMESTAMP_VARIANT4, + TIMESTAMP_VARIANT5, + TIMESTAMP_VARIANT6, + TIMESTAMP_VARIANT7, + TIMESTAMP_VARIANT8, DATETIME, DATETIME_0MS, DATETIME_0S_NS, @@ -637,9 +702,17 @@ public void testToTableSchema_map() { public void testToTableRow_flat() { TableRow row = toTableRow().apply(FLAT_ROW); - assertThat(row.size(), equalTo(23)); + assertThat(row.size(), equalTo(27)); assertThat(row, hasEntry("id", "123")); assertThat(row, hasEntry("value", "123.456")); + assertThat(row, hasEntry("timestamp_variant1", "2019-08-16 13:52:07.000 UTC")); + assertThat(row, hasEntry("timestamp_variant2", "2019-08-17 14:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant3", "2019-08-18 15:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant4", "1970-01-01 00:02:03.456 UTC")); + assertThat(row, hasEntry("timestamp_variant5", "2024-08-10 16:52:07.100 UTC")); + assertThat(row, hasEntry("timestamp_variant6", "2024-08-10 16:52:07.120 UTC")); + assertThat(row, hasEntry("timestamp_variant7", "2024-08-10 16:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant8", "2024-08-10 16:52:07.123 UTC")); assertThat(row, hasEntry("datetime", "2020-11-02T12:34:56.789876")); assertThat(row, hasEntry("datetime0ms", "2020-11-02T12:34:56")); assertThat(row, hasEntry("datetime0s_ns", "2020-11-02T12:34:00.789876")); @@ -692,9 +765,17 @@ public void 
testToTableRow_row() { assertThat(row.size(), equalTo(1)); row = (TableRow) row.get("row"); - assertThat(row.size(), equalTo(23)); + assertThat(row.size(), equalTo(27)); assertThat(row, hasEntry("id", "123")); assertThat(row, hasEntry("value", "123.456")); + assertThat(row, hasEntry("timestamp_variant1", "2019-08-16 13:52:07.000 UTC")); + assertThat(row, hasEntry("timestamp_variant2", "2019-08-17 14:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant3", "2019-08-18 15:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant4", "1970-01-01 00:02:03.456 UTC")); + assertThat(row, hasEntry("timestamp_variant5", "2024-08-10 16:52:07.100 UTC")); + assertThat(row, hasEntry("timestamp_variant6", "2024-08-10 16:52:07.120 UTC")); + assertThat(row, hasEntry("timestamp_variant7", "2024-08-10 16:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant8", "2024-08-10 16:52:07.123 UTC")); assertThat(row, hasEntry("datetime", "2020-11-02T12:34:56.789876")); assertThat(row, hasEntry("datetime0ms", "2020-11-02T12:34:56")); assertThat(row, hasEntry("datetime0s_ns", "2020-11-02T12:34:00.789876")); @@ -720,9 +801,17 @@ public void testToTableRow_array_row() { assertThat(row.size(), equalTo(1)); row = ((List) row.get("rows")).get(0); - assertThat(row.size(), equalTo(23)); + assertThat(row.size(), equalTo(27)); assertThat(row, hasEntry("id", "123")); assertThat(row, hasEntry("value", "123.456")); + assertThat(row, hasEntry("timestamp_variant1", "2019-08-16 13:52:07.000 UTC")); + assertThat(row, hasEntry("timestamp_variant2", "2019-08-17 14:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant3", "2019-08-18 15:52:07.123 UTC")); + assertThat(row, hasEntry("timestamp_variant4", "1970-01-01 00:02:03.456 UTC")); + assertThat(row, hasEntry("timestamp_variant5", "2024-08-10 16:52:07.100 UTC")); + assertThat(row, hasEntry("timestamp_variant6", "2024-08-10 16:52:07.120 UTC")); + assertThat(row, hasEntry("timestamp_variant7", "2024-08-10 16:52:07.123 UTC")); + 
assertThat(row, hasEntry("timestamp_variant8", "2024-08-10 16:52:07.123 UTC")); assertThat(row, hasEntry("datetime", "2020-11-02T12:34:56.789876")); assertThat(row, hasEntry("datetime0ms", "2020-11-02T12:34:56")); assertThat(row, hasEntry("datetime0s_ns", "2020-11-02T12:34:00.789876")); @@ -746,7 +835,7 @@ public void testToTableRow_array_row() { public void testToTableRow_null_row() { TableRow row = toTableRow().apply(NULL_FLAT_ROW); - assertThat(row.size(), equalTo(23)); + assertThat(row.size(), equalTo(27)); assertThat(row, hasEntry("id", null)); assertThat(row, hasEntry("value", null)); assertThat(row, hasEntry("name", null)); @@ -754,6 +843,10 @@ public void testToTableRow_null_row() { assertThat(row, hasEntry("timestamp_variant2", null)); assertThat(row, hasEntry("timestamp_variant3", null)); assertThat(row, hasEntry("timestamp_variant4", null)); + assertThat(row, hasEntry("timestamp_variant5", null)); + assertThat(row, hasEntry("timestamp_variant6", null)); + assertThat(row, hasEntry("timestamp_variant7", null)); + assertThat(row, hasEntry("timestamp_variant8", null)); assertThat(row, hasEntry("datetime", null)); assertThat(row, hasEntry("datetime0ms", null)); assertThat(row, hasEntry("datetime0s_ns", null)); From 780eef98083fe56f81cc5c62dc8ff193993584f0 Mon Sep 17 00:00:00 2001 From: twosom <72733442+twosom@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:35:35 +0900 Subject: [PATCH 61/78] Replace StateTag.StateBinder to top level StateBinder in SparkStateInternals (#31798) --- ...PostCommit_Java_ValidatesRunner_Spark.json | 3 +- ...idatesRunner_SparkStructuredStreaming.json | 3 +- ...mit_Java_ValidatesRunner_Spark_Java11.json | 3 +- .../apache/beam/runners/core/StateTag.java | 5 +- .../spark/stateful/SparkStateInternals.java | 108 +++++++++--------- 5 files changed, 62 insertions(+), 60 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json 
index b970762c83970..d59e273949da9 100644 --- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json +++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json @@ -1,4 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test" + "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test", + "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test" } diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json index b970762c83970..d59e273949da9 100644 --- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json +++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json @@ -1,4 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test" + "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test", + "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test" } diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json index b970762c83970..d59e273949da9 100644 --- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json +++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json @@ -1,4 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test" + "https://github.com/apache/beam/pull/31156": "noting that 
PR #31156 should run this test", + "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test" } diff --git a/runners/core-java/src/main/java/org/apache/beam/runners/core/StateTag.java b/runners/core-java/src/main/java/org/apache/beam/runners/core/StateTag.java index 8c699ac311179..0106f95ed7486 100644 --- a/runners/core-java/src/main/java/org/apache/beam/runners/core/StateTag.java +++ b/runners/core-java/src/main/java/org/apache/beam/runners/core/StateTag.java @@ -69,8 +69,9 @@ public interface StateTag extends Serializable { /** * Visitor for binding a {@link StateSpec} and to the associated {@link State}. * - * @deprecated for migration only; runners should reference the top level {@link StateBinder} and - * move towards {@link StateSpec} rather than {@link StateTag}. + * @deprecated for migration only; runners should reference the top level {@link + * org.apache.beam.sdk.state.StateBinder} and move towards {@link StateSpec} rather than + * {@link StateTag}. 
*/ @Deprecated public interface StateBinder { diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkStateInternals.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkStateInternals.java index 731cadb89f0c2..7ca0dc29e615d 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkStateInternals.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/stateful/SparkStateInternals.java @@ -27,7 +27,6 @@ import org.apache.beam.runners.core.StateInternals; import org.apache.beam.runners.core.StateNamespace; import org.apache.beam.runners.core.StateTag; -import org.apache.beam.runners.core.StateTag.StateBinder; import org.apache.beam.runners.spark.coders.CoderHelpers; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.InstantCoder; @@ -42,11 +41,13 @@ import org.apache.beam.sdk.state.ReadableStates; import org.apache.beam.sdk.state.SetState; import org.apache.beam.sdk.state.State; +import org.apache.beam.sdk.state.StateBinder; import org.apache.beam.sdk.state.StateContext; +import org.apache.beam.sdk.state.StateSpec; import org.apache.beam.sdk.state.ValueState; import org.apache.beam.sdk.state.WatermarkHoldState; import org.apache.beam.sdk.transforms.Combine.CombineFn; -import org.apache.beam.sdk.transforms.CombineWithContext.CombineFnWithContext; +import org.apache.beam.sdk.transforms.CombineWithContext; import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; import org.apache.beam.sdk.util.CombineFnUtil; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.HashBasedTable; @@ -96,45 +97,47 @@ public K getKey() { @Override public T state( StateNamespace namespace, StateTag address, StateContext c) { - return address.bind(new SparkStateBinder(namespace, c)); + return address.getSpec().bind(address.getId(), new SparkStateBinder(namespace, c)); } private class SparkStateBinder implements StateBinder { private final 
StateNamespace namespace; - private final StateContext c; + private final StateContext stateContext; - private SparkStateBinder(StateNamespace namespace, StateContext c) { + private SparkStateBinder(StateNamespace namespace, StateContext stateContext) { this.namespace = namespace; - this.c = c; + this.stateContext = stateContext; } @Override - public ValueState bindValue(StateTag> address, Coder coder) { - return new SparkValueState<>(namespace, address, coder); + public ValueState bindValue(String id, StateSpec> spec, Coder coder) { + return new SparkValueState<>(namespace, id, coder); } @Override - public BagState bindBag(StateTag> address, Coder elemCoder) { - return new SparkBagState<>(namespace, address, elemCoder); + public BagState bindBag(String id, StateSpec> spec, Coder elemCoder) { + return new SparkBagState<>(namespace, id, elemCoder); } @Override - public SetState bindSet(StateTag> spec, Coder elemCoder) { + public SetState bindSet(String id, StateSpec> spec, Coder elemCoder) { throw new UnsupportedOperationException( String.format("%s is not supported", SetState.class.getSimpleName())); } @Override public MapState bindMap( - StateTag> address, + String id, + StateSpec> spec, Coder mapKeyCoder, Coder mapValueCoder) { - return new SparkMapState<>(namespace, address, MapCoder.of(mapKeyCoder, mapValueCoder)); + return new SparkMapState<>(namespace, id, MapCoder.of(mapKeyCoder, mapValueCoder)); } @Override public MultimapState bindMultimap( - StateTag> spec, + String id, + StateSpec> spec, Coder keyCoder, Coder valueCoder) { throw new UnsupportedOperationException( @@ -143,50 +146,51 @@ public MultimapState bindMultimap( @Override public OrderedListState bindOrderedList( - StateTag> spec, Coder elemCoder) { + String id, StateSpec> spec, Coder elemCoder) { throw new UnsupportedOperationException( String.format("%s is not supported", OrderedListState.class.getSimpleName())); } @Override - public CombiningState bindCombiningValue( - StateTag> address, + 
public CombiningState bindCombining( + String id, + StateSpec> spec, Coder accumCoder, CombineFn combineFn) { - return new SparkCombiningState<>(namespace, address, accumCoder, combineFn); + return new SparkCombiningState<>(namespace, id, accumCoder, combineFn); } @Override public - CombiningState bindCombiningValueWithContext( - StateTag> address, + CombiningState bindCombiningWithContext( + String id, + StateSpec> spec, Coder accumCoder, - CombineFnWithContext combineFn) { + CombineWithContext.CombineFnWithContext combineFn) { return new SparkCombiningState<>( - namespace, address, accumCoder, CombineFnUtil.bindContext(combineFn, c)); + namespace, id, accumCoder, CombineFnUtil.bindContext(combineFn, stateContext)); } @Override public WatermarkHoldState bindWatermark( - StateTag address, TimestampCombiner timestampCombiner) { - return new SparkWatermarkHoldState(namespace, address, timestampCombiner); + String id, StateSpec spec, TimestampCombiner timestampCombiner) { + return new SparkWatermarkHoldState(namespace, id, timestampCombiner); } } private class AbstractState { final StateNamespace namespace; - final StateTag address; + final String id; final Coder coder; - private AbstractState( - StateNamespace namespace, StateTag address, Coder coder) { + private AbstractState(StateNamespace namespace, String id, Coder coder) { this.namespace = namespace; - this.address = address; + this.id = id; this.coder = coder; } T readValue() { - byte[] buf = stateTable.get(namespace.stringKey(), address.getId()); + byte[] buf = stateTable.get(namespace.stringKey(), id); if (buf != null) { return CoderHelpers.fromByteArray(buf, coder); } @@ -194,12 +198,11 @@ T readValue() { } void writeValue(T input) { - stateTable.put( - namespace.stringKey(), address.getId(), CoderHelpers.toByteArray(input, coder)); + stateTable.put(namespace.stringKey(), id, CoderHelpers.toByteArray(input, coder)); } public void clear() { - stateTable.remove(namespace.stringKey(), address.getId()); + 
stateTable.remove(namespace.stringKey(), id); } @Override @@ -212,22 +215,21 @@ public boolean equals(@Nullable Object o) { } @SuppressWarnings("unchecked") AbstractState that = (AbstractState) o; - return namespace.equals(that.namespace) && address.equals(that.address); + return namespace.equals(that.namespace) && id.equals(that.id); } @Override public int hashCode() { int result = namespace.hashCode(); - result = 31 * result + address.hashCode(); + result = 31 * result + id.hashCode(); return result; } } private class SparkValueState extends AbstractState implements ValueState { - private SparkValueState( - StateNamespace namespace, StateTag> address, Coder coder) { - super(namespace, address, coder); + private SparkValueState(StateNamespace namespace, String id, Coder coder) { + super(namespace, id, coder); } @Override @@ -252,10 +254,8 @@ private class SparkWatermarkHoldState extends AbstractState private final TimestampCombiner timestampCombiner; SparkWatermarkHoldState( - StateNamespace namespace, - StateTag address, - TimestampCombiner timestampCombiner) { - super(namespace, address, InstantCoder.of()); + StateNamespace namespace, String id, TimestampCombiner timestampCombiner) { + super(namespace, id, InstantCoder.of()); this.timestampCombiner = timestampCombiner; } @@ -287,7 +287,7 @@ public ReadableState readLater() { @Override public Boolean read() { - return stateTable.get(namespace.stringKey(), address.getId()) == null; + return stateTable.get(namespace.stringKey(), id) == null; } }; } @@ -299,22 +299,22 @@ public TimestampCombiner getTimestampCombiner() { } @SuppressWarnings("TypeParameterShadowing") - private class SparkCombiningState extends AbstractState + private class SparkCombiningState extends AbstractState implements CombiningState { private final CombineFn combineFn; private SparkCombiningState( StateNamespace namespace, - StateTag> address, + String id, Coder coder, CombineFn combineFn) { - super(namespace, address, coder); + 
super(namespace, id, coder); this.combineFn = combineFn; } @Override - public SparkCombiningState readLater() { + public SparkCombiningState readLater() { return this; } @@ -348,7 +348,7 @@ public ReadableState readLater() { @Override public Boolean read() { - return stateTable.get(namespace.stringKey(), address.getId()) == null; + return stateTable.get(namespace.stringKey(), id) == null; } }; } @@ -369,10 +369,8 @@ private final class SparkMapState extends AbstractState> implements MapState { private SparkMapState( - StateNamespace namespace, - StateTag address, - Coder> coder) { - super(namespace, address, coder); + StateNamespace namespace, String id, Coder> coder) { + super(namespace, id, coder); } @Override @@ -490,7 +488,7 @@ public ReadableState isEmpty() { return new ReadableState() { @Override public Boolean read() { - return stateTable.get(namespace.stringKey(), address.getId()) == null; + return stateTable.get(namespace.stringKey(), id) == null; } @Override @@ -502,8 +500,8 @@ public ReadableState readLater() { } private final class SparkBagState extends AbstractState> implements BagState { - private SparkBagState(StateNamespace namespace, StateTag> address, Coder coder) { - super(namespace, address, ListCoder.of(coder)); + private SparkBagState(StateNamespace namespace, String id, Coder coder) { + super(namespace, id, ListCoder.of(coder)); } @Override @@ -537,7 +535,7 @@ public ReadableState readLater() { @Override public Boolean read() { - return stateTable.get(namespace.stringKey(), address.getId()) == null; + return stateTable.get(namespace.stringKey(), id) == null; } }; } From ab4ada4ff404951daa3903b5a96a051d396c4e7a Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Mon, 12 Aug 2024 10:12:23 -0400 Subject: [PATCH 62/78] Skip most bigtableIO write error handling test on Dataflow runner (#32048) --- runners/google-cloud-dataflow-java/build.gradle | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git 
a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index 5d898bb57d86b..55f0074b9f314 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -573,6 +573,13 @@ task googleCloudPlatformLegacyWorkerIntegrationTest(type: Test, dependsOn: copyG testClassesDirs = files(project(":sdks:java:io:google-cloud-platform").sourceSets.test.output.classesDirs) useJUnit { excludeCategories "org.apache.beam.sdk.testing.UsesKms" + filter { + // Only needs to run on direct runner + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithEmptyMutationFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithEmptyRowFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithInvalidTimestampFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithOversizedQualifierFailures' + } } } @@ -623,7 +630,15 @@ task googleCloudPlatformRunnerV2IntegrationTest(type: Test) { maxParallelForks 4 classpath = configurations.googleCloudPlatformIntegrationTest testClassesDirs = files(project(":sdks:java:io:google-cloud-platform").sourceSets.test.output.classesDirs) - useJUnit { } + useJUnit { + filter { + // Only needs to run on direct runner + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithEmptyMutationFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithEmptyRowFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithInvalidTimestampFailures' + excludeTestsMatching 'org.apache.beam.sdk.io.gcp.bigtable.BigtableWriteIT.testE2EBigtableWriteWithOversizedQualifierFailures' + } + } } task examplesJavaRunnerV2PreCommit(type: Test) { From 
29de91383f3633b9a80c885a84726accc5fc6bf1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:16:52 -0400 Subject: [PATCH 63/78] Bump cloud.google.com/go/bigtable from 1.28.0 to 1.29.0 in /sdks (#32151) Bumps [cloud.google.com/go/bigtable](https://github.com/googleapis/google-cloud-go) from 1.28.0 to 1.29.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/documentai/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/pubsub/v1.28.0...pubsub/v1.29.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/bigtable dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 22 +++++++++++----------- sdks/go.sum | 52 ++++++++++++++++++++++++++-------------------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 624cc0ab1ce82..16b39cdbe3a78 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -24,7 +24,7 @@ go 1.21 require ( cloud.google.com/go/bigquery v1.62.0 - cloud.google.com/go/bigtable v1.28.0 + cloud.google.com/go/bigtable v1.29.0 cloud.google.com/go/datastore v1.17.1 cloud.google.com/go/profiler v0.4.1 cloud.google.com/go/pubsub v1.40.0 @@ -58,8 +58,8 @@ require ( golang.org/x/sync v0.8.0 golang.org/x/sys v0.23.0 golang.org/x/text v0.17.0 - google.golang.org/api v0.189.0 - google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f + google.golang.org/api v0.191.0 + google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf google.golang.org/grpc v1.65.0 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v2 v2.4.0 @@ -74,9 +74,9 @@ require ( require ( cel.dev/expr v0.15.0 // indirect - cloud.google.com/go/auth v0.7.2 // indirect + cloud.google.com/go/auth v0.8.0 
// indirect cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect - cloud.google.com/go/monitoring v1.20.2 // indirect + cloud.google.com/go/monitoring v1.20.3 // indirect dario.cat/mergo v1.0.0 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.0 // indirect @@ -110,14 +110,14 @@ require ( go.opentelemetry.io/otel/sdk v1.24.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect - golang.org/x/time v0.5.0 // indirect + golang.org/x/time v0.6.0 // indirect ) require ( cloud.google.com/go v0.115.0 // indirect cloud.google.com/go/compute/metadata v0.5.0 // indirect - cloud.google.com/go/iam v1.1.11 // indirect - cloud.google.com/go/longrunning v0.5.10 // indirect + cloud.google.com/go/iam v1.1.12 // indirect + cloud.google.com/go/longrunning v0.5.11 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect @@ -154,7 +154,7 @@ require ( github.com/google/flatbuffers v23.5.26+incompatible // indirect github.com/google/pprof v0.0.0-20240528025155-186aa0362fba // indirect github.com/google/renameio/v2 v2.0.0 // indirect - github.com/google/s2a-go v0.1.7 // indirect + github.com/google/s2a-go v0.1.8 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.13.0 // indirect github.com/gorilla/handlers v1.5.2 // indirect @@ -187,6 +187,6 @@ require ( golang.org/x/mod v0.18.0 // indirect golang.org/x/tools v0.22.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade // indirect + google.golang.org/genproto/googleapis/api 
v0.0.0-20240725223205-93522f1f2a9f // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 67686da8e408a..8b25c7c000d5d 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -101,8 +101,8 @@ cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVo cloud.google.com/go/assuredworkloads v1.8.0/go.mod h1:AsX2cqyNCOvEQC8RMPnoc0yEarXQk6WEKkxYfL6kGIo= cloud.google.com/go/assuredworkloads v1.9.0/go.mod h1:kFuI1P78bplYtT77Tb1hi0FMxM0vVpRC7VVoJC3ZoT0= cloud.google.com/go/assuredworkloads v1.10.0/go.mod h1:kwdUQuXcedVdsIaKgKTp9t0UJkE5+PAVNhdQm4ZVq2E= -cloud.google.com/go/auth v0.7.2 h1:uiha352VrCDMXg+yoBtaD0tUF4Kv9vrtrWPYXwutnDE= -cloud.google.com/go/auth v0.7.2/go.mod h1:VEc4p5NNxycWQTMQEDQF0bd6aTMb6VgYDXEwiJJQAbs= +cloud.google.com/go/auth v0.8.0 h1:y8jUJLl/Fg+qNBWxP/Hox2ezJvjkrPb952PC1p0G6A4= +cloud.google.com/go/auth v0.8.0/go.mod h1:qGVp/Y3kDRSDZ5gFD/XPUfYQ9xW1iI7q8RIRoCyBbJc= cloud.google.com/go/auth/oauth2adapt v0.2.3 h1:MlxF+Pd3OmSudg/b1yZ5lJwoXCEaeedAguodky1PcKI= cloud.google.com/go/auth/oauth2adapt v0.2.3/go.mod h1:tMQXOfZzFuNuUxOypHlQEXgdfX5cuhwU+ffUuXRJE8I= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -135,8 +135,8 @@ cloud.google.com/go/bigquery v1.49.0/go.mod h1:Sv8hMmTFFYBlt/ftw2uN6dFdQPzBlREY9 cloud.google.com/go/bigquery v1.50.0/go.mod h1:YrleYEh2pSEbgTBZYMJ5SuSr0ML3ypjRB1zgf7pvQLU= cloud.google.com/go/bigquery v1.62.0 h1:SYEA2f7fKqbSRRBHb7g0iHTtZvtPSPYdXfmqsjpsBwo= cloud.google.com/go/bigquery v1.62.0/go.mod h1:5ee+ZkF1x/ntgCsFQJAQTM3QkAZOecfCmvxhkJsWRSA= -cloud.google.com/go/bigtable v1.28.0 h1:c0wc/wy+9Chj8BooqW/zgaeslXsA5YEYl84VBmvwp+4= -cloud.google.com/go/bigtable v1.28.0/go.mod h1:avmXcmxVbLJAo9moICRYMgDyTTPoV0MA0lHKnyqV4fQ= +cloud.google.com/go/bigtable v1.29.0 h1:2CnFjKPwjpZMZdTi2RpppvxzD80zKzDYrLYEQw/NnAs= +cloud.google.com/go/bigtable v1.29.0/go.mod 
h1:5p909nNdWaNUcWs6KGZO8mI5HUovstlmrIi7+eA5PTQ= cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= cloud.google.com/go/billing v1.5.0/go.mod h1:mztb1tBc3QekhjSgmpf/CV4LzWXLzCArwpLmP2Gm88s= cloud.google.com/go/billing v1.6.0/go.mod h1:WoXzguj+BeHXPbKfNWkqVtDdzORazmCjraY+vrxcyvI= @@ -210,8 +210,8 @@ cloud.google.com/go/datacatalog v1.8.0/go.mod h1:KYuoVOv9BM8EYz/4eMFxrr4DUKhGIOX cloud.google.com/go/datacatalog v1.8.1/go.mod h1:RJ58z4rMp3gvETA465Vg+ag8BGgBdnRPEMMSTr5Uv+M= cloud.google.com/go/datacatalog v1.12.0/go.mod h1:CWae8rFkfp6LzLumKOnmVh4+Zle4A3NXLzVJ1d1mRm0= cloud.google.com/go/datacatalog v1.13.0/go.mod h1:E4Rj9a5ZtAxcQJlEBTLgMTphfP11/lNaAshpoBgemX8= -cloud.google.com/go/datacatalog v1.20.4 h1:nUR7JBPZezl1+o+86N01VxAQQHY+It/D8tmNipcdVjI= -cloud.google.com/go/datacatalog v1.20.4/go.mod h1:71PDwywIYkNgSXdUU3H0mkTp3j15aahfYJ1CY3DogtU= +cloud.google.com/go/datacatalog v1.20.5 h1:Cosg/L60myEbpP1HoNv77ykV7zWe7hqSwY4uUDmhx/I= +cloud.google.com/go/datacatalog v1.20.5/go.mod h1:DB0QWF9nelpsbB0eR/tA0xbHZZMvpoFD1XFy3Qv/McI= cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= cloud.google.com/go/dataflow v0.7.0/go.mod h1:PX526vb4ijFMesO1o202EaUmouZKBpjHsTlCtB4parQ= cloud.google.com/go/dataflow v0.8.0/go.mod h1:Rcf5YgTKPtQyYz8bLYhFoIV/vP39eL7fWNcSOyFfLJE= @@ -327,8 +327,8 @@ cloud.google.com/go/iam v0.8.0/go.mod h1:lga0/y3iH6CX7sYqypWJ33hf7kkfXJag67naqGE cloud.google.com/go/iam v0.11.0/go.mod h1:9PiLDanza5D+oWFZiH1uG+RnRCfEGKoyl6yo4cgWZGY= cloud.google.com/go/iam v0.12.0/go.mod h1:knyHGviacl11zrtZUoDuYpDgLjvr28sLQaG0YB2GYAY= cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= -cloud.google.com/go/iam v1.1.11 h1:0mQ8UKSfdHLut6pH9FM3bI55KWR46ketn0PuXleDyxw= -cloud.google.com/go/iam v1.1.11/go.mod h1:biXoiLWYIKntto2joP+62sd9uW5EpkZmKIvfNcTWlnQ= +cloud.google.com/go/iam v1.1.12 h1:JixGLimRrNGcxvJEQ8+clfLxPlbeZA6MuRJ+qJNQ5Xw= +cloud.google.com/go/iam v1.1.12/go.mod 
h1:9LDX8J7dN5YRyzVHxwQzrQs9opFFqn0Mxs9nAeB+Hhg= cloud.google.com/go/iap v1.4.0/go.mod h1:RGFwRJdihTINIe4wZ2iCP0zF/qu18ZwyKxrhMhygBEc= cloud.google.com/go/iap v1.5.0/go.mod h1:UH/CGgKd4KyohZL5Pt0jSKE4m3FR51qg6FKQ/z/Ix9A= cloud.google.com/go/iap v1.6.0/go.mod h1:NSuvI9C/j7UdjGjIde7t7HBz+QTwBcapPE07+sSRcLk= @@ -348,8 +348,8 @@ cloud.google.com/go/kms v1.8.0/go.mod h1:4xFEhYFqvW+4VMELtZyxomGSYtSQKzM178ylFW4 cloud.google.com/go/kms v1.9.0/go.mod h1:qb1tPTgfF9RQP8e1wq4cLFErVuTJv7UsSC915J8dh3w= cloud.google.com/go/kms v1.10.0/go.mod h1:ng3KTUtQQU9bPX3+QGLsflZIHlkbn8amFAMY63m8d24= cloud.google.com/go/kms v1.10.1/go.mod h1:rIWk/TryCkR59GMC3YtHtXeLzd634lBbKenvyySAyYI= -cloud.google.com/go/kms v1.18.3 h1:8+Z2S4bQDSCdghB5ZA5dVDDJTLmnkRlowtFiXqMFd74= -cloud.google.com/go/kms v1.18.3/go.mod h1:y/Lcf6fyhbdn7MrG1VaDqXxM8rhOBc5rWcWAhcvZjQU= +cloud.google.com/go/kms v1.18.4 h1:dYN3OCsQ6wJLLtOnI8DGUwQ5shMusXsWCCC+s09ATsk= +cloud.google.com/go/kms v1.18.4/go.mod h1:SG1bgQ3UWW6/KdPo9uuJnzELXY5YTTMJtDYvajiQ22g= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/language v1.7.0/go.mod h1:DJ6dYN/W+SQOjF8e1hLQXMF21AkH2w9wiPzPCJa2MIE= @@ -363,8 +363,8 @@ cloud.google.com/go/logging v1.7.0/go.mod h1:3xjP2CjkM3ZkO73aj4ASA5wRPGGCRrPIAeN cloud.google.com/go/longrunning v0.1.1/go.mod h1:UUFxuDWkv22EuY93jjmDMFT5GPQKeFVJBIF6QlTqdsE= cloud.google.com/go/longrunning v0.3.0/go.mod h1:qth9Y41RRSUE69rDcOn6DdK3HfQfsUI0YSmW3iIlLJc= cloud.google.com/go/longrunning v0.4.1/go.mod h1:4iWDqhBZ70CvZ6BfETbvam3T8FMvLK+eFj0E6AaRQTo= -cloud.google.com/go/longrunning v0.5.10 h1:eB/BniENNRKhjz/xgiillrdcH3G74TGSl3BXinGlI7E= -cloud.google.com/go/longrunning v0.5.10/go.mod h1:tljz5guTr5oc/qhlUjBlk7UAIFMOGuPNxkNDZXlLics= +cloud.google.com/go/longrunning v0.5.11 h1:Havn1kGjz3whCfoD8dxMLP73Ph5w+ODyZB9RUsDxtGk= +cloud.google.com/go/longrunning v0.5.11/go.mod 
h1:rDn7//lmlfWV1Dx6IB4RatCPenTwwmqXuiP0/RgoEO4= cloud.google.com/go/managedidentities v1.3.0/go.mod h1:UzlW3cBOiPrzucO5qWkNkh0w33KFtBJU281hacNvsdE= cloud.google.com/go/managedidentities v1.4.0/go.mod h1:NWSBYbEMgqmbZsLIyKvxrYbtqOsxY1ZrGM+9RgDqInM= cloud.google.com/go/managedidentities v1.5.0/go.mod h1:+dWcZ0JlUmpuxpIDfyP5pP5y0bLdRwOS4Lp7gMni/LA= @@ -388,8 +388,8 @@ cloud.google.com/go/monitoring v1.7.0/go.mod h1:HpYse6kkGo//7p6sT0wsIC6IBDET0RhI cloud.google.com/go/monitoring v1.8.0/go.mod h1:E7PtoMJ1kQXWxPjB6mv2fhC5/15jInuulFdYYtlcvT4= cloud.google.com/go/monitoring v1.12.0/go.mod h1:yx8Jj2fZNEkL/GYZyTLS4ZtZEZN8WtDEiEqG4kLK50w= cloud.google.com/go/monitoring v1.13.0/go.mod h1:k2yMBAB1H9JT/QETjNkgdCGD9bPF712XiLTVr+cBrpw= -cloud.google.com/go/monitoring v1.20.2 h1:B/L+xrw9PYO7ywh37sgnjI/6dzEE+yQTAwfytDcpPto= -cloud.google.com/go/monitoring v1.20.2/go.mod h1:36rpg/7fdQ7NX5pG5x1FA7cXTVXusOp6Zg9r9e1+oek= +cloud.google.com/go/monitoring v1.20.3 h1:v/7MXFxYrhXLEZ9sSfwXdlTLLB/xrU7xTyYjY5acynQ= +cloud.google.com/go/monitoring v1.20.3/go.mod h1:GPIVIdNznIdGqEjtRKQWTLcUeRnPjZW85szouimiczU= cloud.google.com/go/networkconnectivity v1.4.0/go.mod h1:nOl7YL8odKyAOtzNX73/M5/mGZgqqMeryi6UPZTk/rA= cloud.google.com/go/networkconnectivity v1.5.0/go.mod h1:3GzqJx7uhtlM3kln0+x5wyFvuVH1pIBJjhCpjzSt75o= cloud.google.com/go/networkconnectivity v1.6.0/go.mod h1:OJOoEXW+0LAxHh89nXd64uGG+FbQoeH8DtxCHVOMlaM= @@ -943,8 +943,8 @@ github.com/google/pprof v0.0.0-20240528025155-186aa0362fba/go.mod h1:K1liHPHnj73 github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/renameio/v2 v2.0.0 h1:UifI23ZTGY8Tt29JbYFiuyIU3eX+RNFtUwefq9qAhxg= github.com/google/renameio/v2 v2.0.0/go.mod h1:BtmJXm5YlszgC+TD4HOEEUFgkJP3nLxehU6hfe7jRt4= -github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= -github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= +github.com/google/s2a-go v0.1.8 
h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= +github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.2.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -1543,8 +1543,8 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20220922220347-f3bd1da661af/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.1.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= +golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -1688,8 +1688,8 @@ google.golang.org/api v0.108.0/go.mod h1:2Ts0XTHNVWxypznxWOYUeI4g3WdP9Pk2Qk58+a/ google.golang.org/api v0.110.0/go.mod h1:7FC4Vvx1Mooxh8C5HWjzZHcavuS2f6pmJpZx60ca7iI= google.golang.org/api v0.111.0/go.mod h1:qtFHvU9mhgTJegR31csQ+rwxyUTHOKFqCKWp1J0fdw0= google.golang.org/api v0.114.0/go.mod h1:ifYI2ZsFK6/uGddGfAD5BMxlnkBqCmqHSDUVi45N5Yg= -google.golang.org/api v0.189.0 h1:equMo30LypAkdkLMBqfeIqtyAnlyig1JSZArl4XPwdI= -google.golang.org/api v0.189.0/go.mod h1:FLWGJKb0hb+pU2j+rJqwbnsF+ym+fQs73rbJ+KAUgy8= +google.golang.org/api v0.191.0 
h1:cJcF09Z+4HAB2t5qTQM1ZtfL/PemsLFkcFG67qq2afk= +google.golang.org/api v0.191.0/go.mod h1:tD5dsFGxFza0hnQveGfVk9QQYKcfp+VzgRqyXFxE0+E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1829,12 +1829,12 @@ google.golang.org/genproto v0.0.0-20230323212658-478b75c54725/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230330154414-c0448cd141ea/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOlWu4KYeJZffbWgBkS1YFobzKbLVfK69pe0Ak= google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= -google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f h1:htT2I9bZvGm+110zq8bIErMX+WgBWxCzV3ChwbvnKnc= -google.golang.org/genproto v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:Sk3mLpoDFTAp6R4OvlcUgaG4ISTspKeFsIAXMn9Bm4Y= -google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade h1:WxZOF2yayUHpHSbUE6NMzumUzBxYc3YGwo0YHnbzsJY= -google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade/go.mod h1:mw8MG/Qz5wfgYr6VqVCiZcHe/GJEfI+oGGDCohaVgB0= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade h1:oCRSWfwGXQsqlVdErcyTt4A93Y8fo0/9D4b1gnI++qo= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240722135656-d784300faade/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= +google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf h1:OqdXDEakZCVtDiZTjcxfwbHPCT11ycCEsTKesBVKvyY= +google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:mCr1K1c8kX+1iSBREvU3Juo11CB+QOEWxbRS01wWl5M= +google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f h1:b1Ln/PG8orm0SsBbHZWke8dDp2lrCD4jSmfglFpTZbk= 
+google.golang.org/genproto/googleapis/api v0.0.0-20240725223205-93522f1f2a9f/go.mod h1:AHT0dDg3SoMOgZGnZk29b5xTbPHMoEC8qthmBLJCpys= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf h1:liao9UHurZLtiEwBgT9LMOnKYsHze6eA6w1KQCMVN2Q= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From f4e43148118be30f7002bd00b5a8df44bd201a24 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:17:20 -0400 Subject: [PATCH 64/78] Bump cloud.google.com/go/pubsub from 1.40.0 to 1.41.0 in /sdks (#32149) Bumps [cloud.google.com/go/pubsub](https://github.com/googleapis/google-cloud-go) from 1.40.0 to 1.41.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/pubsub/v1.40.0...pubsub/v1.41.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/pubsub dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 16b39cdbe3a78..5d179736baf09 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -27,7 +27,7 @@ require ( cloud.google.com/go/bigtable v1.29.0 cloud.google.com/go/datastore v1.17.1 cloud.google.com/go/profiler v0.4.1 - cloud.google.com/go/pubsub v1.40.0 + cloud.google.com/go/pubsub v1.41.0 cloud.google.com/go/spanner v1.66.0 cloud.google.com/go/storage v1.43.0 github.com/aws/aws-sdk-go-v2 v1.30.3 diff --git a/sdks/go.sum b/sdks/go.sum index 8b25c7c000d5d..2b43c6d62bc92 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -449,8 +449,8 @@ cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcd cloud.google.com/go/pubsub v1.27.1/go.mod h1:hQN39ymbV9geqBnfQq6Xf63yNhUAhv9CZhzp5O6qsW0= cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8= cloud.google.com/go/pubsub v1.30.0/go.mod h1:qWi1OPS0B+b5L+Sg6Gmc9zD1Y+HaM0MdUr7LsupY1P4= -cloud.google.com/go/pubsub v1.40.0 h1:0LdP+zj5XaPAGtWr2V6r88VXJlmtaB/+fde1q3TU8M0= -cloud.google.com/go/pubsub v1.40.0/go.mod h1:BVJI4sI2FyXp36KFKvFwcfDRDfR8MiLT8mMhmIhdAeA= +cloud.google.com/go/pubsub v1.41.0 h1:ZPaM/CvTO6T+1tQOs/jJ4OEMpjtel0PTLV7j1JK+ZrI= +cloud.google.com/go/pubsub v1.41.0/go.mod h1:g+YzC6w/3N91tzG66e2BZtp7WrpBBMXVa3Y9zVoOGpk= cloud.google.com/go/pubsublite v1.5.0/go.mod h1:xapqNQ1CuLfGi23Yda/9l4bBCKz/wC3KIJ5gKcxveZg= cloud.google.com/go/pubsublite v1.6.0/go.mod h1:1eFCS0U11xlOuMFV/0iBqw3zP12kddMeCbj/F3FSj9k= cloud.google.com/go/pubsublite v1.7.0/go.mod h1:8hVMwRXfDfvGm3fahVbtDbiLePT3gpoiJYJY+vxWxVM= From 8b6f37b28746bcd8e6c8be4382ef72072cfdb959 Mon Sep 17 00:00:00 2001 From: jonathan-lemos Date: Mon, 12 Aug 2024 10:17:56 -0400 Subject: [PATCH 65/78] Fix broken Beam Quest URL in README.md (#32145) --- README.md | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d564024f87df..8a9bb4e988d58 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Here are some resources actively maintained by the Beam community to help you ge A comprehensive, interactive learning experience covering Beam concepts in depth. - Beam Quest + Beam Quest A certification granted by Google Cloud, certifying proficiency in Beam. From 2ce0ee34470519a1da61a2ac241997232891f48a Mon Sep 17 00:00:00 2001 From: Ayush Pandey <32012449+itsayushpandey@users.noreply.github.com> Date: Mon, 12 Aug 2024 08:35:32 -0600 Subject: [PATCH 66/78] Added insertion and enrichment pipeline (#31657) * Adding insertion and enrichment pipeline * Enhanced Data Schema * Added Apache Licensed to the notebook * Adding Chunking Strategy * removed unused imports * Modified insertion logic in redis for incorporating chunking strategy * refacted redis code * code review changes * Added chunking code in notebook * Added code review changes * Code review changes: using chunking strategy as enum * Added Code Review Changes * Code review changes * Added code review changes * Added Code Review Changes * Code review changes --- .../rag_usecase/beam_rag_notebook.ipynb | 1795 +++++++++++++++++ .../beam-ml/rag_usecase/chunks_generation.py | 129 ++ .../beam-ml/rag_usecase/redis_connector.py | 349 ++++ .../beam-ml/rag_usecase/redis_enrichment.py | 110 + 4 files changed, 2383 insertions(+) create mode 100644 examples/notebooks/beam-ml/rag_usecase/beam_rag_notebook.ipynb create mode 100644 examples/notebooks/beam-ml/rag_usecase/chunks_generation.py create mode 100644 examples/notebooks/beam-ml/rag_usecase/redis_connector.py create mode 100644 examples/notebooks/beam-ml/rag_usecase/redis_enrichment.py diff --git a/examples/notebooks/beam-ml/rag_usecase/beam_rag_notebook.ipynb b/examples/notebooks/beam-ml/rag_usecase/beam_rag_notebook.ipynb new file mode 100644 index 0000000000000..e271074af5551 --- /dev/null +++ 
b/examples/notebooks/beam-ml/rag_usecase/beam_rag_notebook.ipynb @@ -0,0 +1,1795 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "af127b51-1c7e-4e56-9759-aee40d9df194", + "metadata": {}, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. 
See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "160b9fee-00e9-4dd1-b1db-3d050e1bc710", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas==1.4.4 in /usr/local/lib/python3.10/site-packages (1.4.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/site-packages (from pandas==1.4.4) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/site-packages (from pandas==1.4.4) (2022.2.1)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/site-packages (from pandas==1.4.4) (1.24.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas==1.4.4) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: numpy==1.24.4 in /usr/local/lib/python3.10/site-packages (1.24.4)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: apache_beam==2.56.0 in 
/usr/local/lib/python3.10/site-packages (2.56.0)\n", + "Requirement already satisfied: crcmod<2.0,>=1.7 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.7)\n", + "Requirement already satisfied: orjson<4,>=3.9.7 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (3.10.0)\n", + "Requirement already satisfied: dill<0.3.2,>=0.3.1.1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.3.1.1)\n", + "Requirement already satisfied: cloudpickle~=2.2.1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2.2.1)\n", + "Requirement already satisfied: fastavro<2,>=0.23.6 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.9.4)\n", + "Requirement already satisfied: fasteners<1.0,>=0.3 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.19)\n", + "Requirement already satisfied: grpcio!=1.48.0,!=1.59.*,!=1.60.*,!=1.61.*,!=1.62.0,!=1.62.1,<2,>=1.33.1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.53.0)\n", + "Requirement already satisfied: hdfs<3.0.0,>=2.1.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2.7.3)\n", + "Requirement already satisfied: httplib2<0.23.0,>=0.8 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.22.0)\n", + "Requirement already satisfied: jsonschema<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (4.19.0)\n", + "Requirement already satisfied: jsonpickle<4.0.0,>=3.0.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (3.0.2)\n", + "Requirement already satisfied: numpy<1.27.0,>=1.14.3 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.24.4)\n", + "Requirement already satisfied: objsize<0.8.0,>=0.6.1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.7.0)\n", + "Requirement already satisfied: packaging>=22.0 in /usr/local/lib/python3.10/site-packages (from 
apache_beam==2.56.0) (23.2)\n", + "Requirement already satisfied: pymongo<5.0.0,>=3.8.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (4.7.2)\n", + "Requirement already satisfied: proto-plus<2,>=1.7.1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.23.0)\n", + "Requirement already satisfied: protobuf!=4.0.*,!=4.21.*,!=4.22.0,!=4.23.*,!=4.24.*,<4.26.0,>=3.20.3 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (3.20.3)\n", + "Requirement already satisfied: pydot<2,>=1.2.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (1.4.2)\n", + "Requirement already satisfied: python-dateutil<3,>=2.8.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2.8.2)\n", + "Requirement already satisfied: pytz>=2018.3 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2022.2.1)\n", + "Requirement already satisfied: redis<6,>=5.0.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (5.0.1)\n", + "Requirement already satisfied: regex>=2020.6.8 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2023.3.23)\n", + "Requirement already satisfied: requests<3.0.0,>=2.24.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (2.31.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (4.10.0)\n", + "Requirement already satisfied: zstandard<1,>=0.18.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.22.0)\n", + "Requirement already satisfied: pyarrow<15.0.0,>=3.0.0 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (10.0.1)\n", + "Requirement already satisfied: pyarrow-hotfix<1 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.5)\n", + "Requirement already satisfied: js2py<1,>=0.74 in /usr/local/lib/python3.10/site-packages (from apache_beam==2.56.0) (0.74)\n", + 
"Requirement already satisfied: docopt in /usr/local/lib/python3.10/site-packages (from hdfs<3.0.0,>=2.1.0->apache_beam==2.56.0) (0.6.2)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/site-packages (from hdfs<3.0.0,>=2.1.0->apache_beam==2.56.0) (1.16.0)\n", + "Requirement already satisfied: pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 in /usr/local/lib/python3.10/site-packages (from httplib2<0.23.0,>=0.8->apache_beam==2.56.0) (3.0.9)\n", + "Requirement already satisfied: tzlocal>=1.2 in /usr/local/lib/python3.10/site-packages (from js2py<1,>=0.74->apache_beam==2.56.0) (5.2)\n", + "Requirement already satisfied: pyjsparser>=2.5.1 in /usr/local/lib/python3.10/site-packages (from js2py<1,>=0.74->apache_beam==2.56.0) (2.7.1)\n", + "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.0.0->apache_beam==2.56.0) (23.1.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.0.0->apache_beam==2.56.0) (2023.7.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.0.0->apache_beam==2.56.0) (0.30.2)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/site-packages (from jsonschema<5.0.0,>=4.0.0->apache_beam==2.56.0) (0.10.0)\n", + "Requirement already satisfied: dnspython<3.0.0,>=1.16.0 in /usr/local/lib/python3.10/site-packages (from pymongo<5.0.0,>=3.8.0->apache_beam==2.56.0) (2.6.1)\n", + "Requirement already satisfied: async-timeout>=4.0.2 in /usr/local/lib/python3.10/site-packages (from redis<6,>=5.0.0->apache_beam==2.56.0) (4.0.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/site-packages (from requests<3.0.0,>=2.24.0->apache_beam==2.56.0) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.10/site-packages (from requests<3.0.0,>=2.24.0->apache_beam==2.56.0) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/site-packages (from requests<3.0.0,>=2.24.0->apache_beam==2.56.0) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/site-packages (from requests<3.0.0,>=2.24.0->apache_beam==2.56.0) (2023.7.22)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: redis==5.0.1 in /usr/local/lib/python3.10/site-packages (5.0.1)\n", + "Requirement already satisfied: async-timeout>=4.0.2 in /usr/local/lib/python3.10/site-packages (from redis==5.0.1) (4.0.3)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/usr/local/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "#installing dependencies\n", + "!pip install pandas==1.4.4\n", + "!pip install numpy==1.24.4\n", + "!pip install apache_beam==2.56.0\n", + "!pip install redis==5.0.1\n", + "!pip install langchain==0.1.14 #used for chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bb8f59b0-254f-4b8e-a3dc-9015f35ef798", + "metadata": {}, + "outputs": [], + "source": [ + "#Imports Required for the notebook\n", + "import 
pandas as pd\n", + "import numpy as np\n", + "import apache_beam as beam\n", + "from apache_beam.ml.transforms.base import MLTransform\n", + "from apache_beam.transforms.enrichment import Enrichment\n", + "from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings\n", + "import tempfile\n", + "import redis\n", + "import redis_connector\n", + "import redis_enrichment\n", + "from redis_connector import *\n", + "from redis_enrichment import *\n", + "from redis.commands.search.indexDefinition import (IndexDefinition,IndexType)\n", + "from redis.commands.search.query import Query\n", + "from redis.commands.search.field import (TextField,VectorField)\n", + "from chunks_generation import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3d274890-4e6b-4a3d-b682-9fc6e21e5cca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.56.0'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#To check beam version installed \n", + "beam.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c62d2ac3-36f5-42f2-8560-2e72421a1ff9", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open('hf_small_wikipedia.json', 'r') as j:\n", + " contents = json.loads(j.read())" + ] + }, + { + "cell_type": "markdown", + "id": "19c1c652-b9df-4f7e-bcb5-7ee2d290e091", + "metadata": {}, + "source": [ + "# For now Reading json data locally" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fe0d6dc7-1809-44c9-9a36-b0781ec6731a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is skeptical of all ... 
\\nSocial theories\\nSocialism'}]\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "with open('hf_small_wikipedia.json', 'r') as j:\n", + " contents = json.loads(j.read())\n", + "\n", + "\n", + "print(contents[:1])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "947974b9-0218-4cb0-bd5a-1d57fd37c2f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(contents)" + ] + }, + { + "cell_type": "markdown", + "id": "aa06d33f-ed94-4bea-8b33-04c947a99034", + "metadata": {}, + "source": [ + "# Create Redis Client for connecting to Redis Vector Database" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43342378-18cd-4fd3-849c-2c6f8dc9a5ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "REDIS_HOST = \"localhost\"\n", + "REDIS_PORT = 6379\n", + "REDIS_PASSWORD = \"\" # default for passwordless Redis\n", + "\n", + "# Connect to Redis\n", + "redis_client = redis.Redis(\n", + " host=REDIS_HOST,\n", + " port=REDIS_PORT,\n", + " password=REDIS_PASSWORD\n", + ")\n", + "redis_client.ping()" + ] + }, + { + "cell_type": "markdown", + "id": "72cd4ad1-6577-453a-a2f7-947ae3149993", + "metadata": {}, + "source": [ + "# Creating a Search Index\n", + "Below cells will show how to specify and create a search index in Redis vector DB. 
Below are the following steps:\n", + "\n", + "1) Set some constants for defining our index like the distance metric and the index name\n", + "2) Define the index schema with RediSearch fields\n", + "3) Create the index" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0a1b7250-6bb1-4f29-81fd-9934e7a457cc", + "metadata": {}, + "outputs": [], + "source": [ + "#Constants\n", + "EMBEDDING_MODEL = 'all-MiniLM-L6-v2' # Embedding model name to be use with ML Transform\n", + "VECTOR_DIM = 384 # length of the vector for above embedding model\n", + "VECTOR_NUMBER = 2 # initial number of vectors\n", + "INDEX_NAME = \"embeddings-index\" # name of the search index \n", + "PREFIX = \"doc\" # prefix for the document keys \n", + "DISTANCE_METRIC = \"COSINE\" # distance metric for the vectors (ex. COSINE, IP, L2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78ef7e80-680e-424d-b018-be3bd71008ba", + "metadata": {}, + "outputs": [], + "source": [ + "# Define RediSearch fields for each of the columns in the dataset\n", + "url = TextField(name=\"url\")\n", + "title = TextField(name=\"title\")\n", + "title_embedding = VectorField(\"title_vector\",\n", + " \"FLAT\", {\n", + " \"TYPE\": \"FLOAT32\",\n", + " \"DIM\": VECTOR_DIM,\n", + " \"DISTANCE_METRIC\": DISTANCE_METRIC,\n", + " \"INITIAL_CAP\": VECTOR_NUMBER,\n", + " }\n", + ")\n", + "\n", + "text = TextField(name=\"text\")\n", + "text_embedding = VectorField(\"text_vector\",\n", + " \"FLAT\", {\n", + " \"TYPE\": \"FLOAT32\",\n", + " \"DIM\": VECTOR_DIM,\n", + " \"DISTANCE_METRIC\": DISTANCE_METRIC,\n", + " \"INITIAL_CAP\": VECTOR_NUMBER,\n", + " }\n", + ")\n", + "fields = [url, title, title_embedding, text, text_embedding]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "120eabcf-a87a-4fdf-ba29-3117dec9d858", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index already exists\n" + ] + } + ], + "source": [ + "# Check 
if index exists\n", + "try:\n", + " redis_client.ft(INDEX_NAME).info()\n", + " print(\"Index already exists\")\n", + "except:\n", + " # Create RediSearch Index\n", + " redis_client.ft(INDEX_NAME).create_index(\n", + " fields = fields,\n", + " definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f93ea35f-47d3-43b3-ab32-5dba16612337", + "metadata": {}, + "source": [ + "# Creating Knowledge Base in Redis Vector Database\n", + "After creating a search index, we can load documents into it. We will use the same documents we used in the previous cell." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5736710f-b16d-405e-a1fe-f504e753b024", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.\n", + "WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.\n" + ] + }, + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " 
window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. 
Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "2024-08-09 13:01:57.330902: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + 
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7042b4db72ae4741ad73040ec6888413", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00> beam.Create(contents) \n", + " | \"Generate text chunks\" >> ChunksGeneration(chunk_size = 500, chunk_overlap = 0, chunking_strategy = ChunkingStrategy.SPLIT_BY_TOKENS)\n", + " | \"Insert document in Redis\" >> InsertDocInRedis(host='127.0.0.1',port=6379, batch_size=10)\n", + " | \"Generate Embeddings\" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn) \n", + " | \"Insert Embedding in Redis\" >> InsertEmbeddingInRedis(host='127.0.0.1',port=6379, batch_size=10,embedded_columns=['title','text'])\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "6206f83e-d3b6-4869-a7f2-2f662d3968f0", + "metadata": {}, + "source": [ + "## Pipeline Steps:\n", + "\n", + "Now that we have ingested the documents in Redis, we will create a embeddings transform, which is used for storing 
the text and its embedding in redis vector db\n" + ] + }, + { + "cell_type": "markdown", + "id": "f71e0cad-c062-4c12-9ba6-17010758f6db", + "metadata": {}, + "source": [ + "# Running Search Queries/ Perform Enrichment" + ] + }, + { + "cell_type": "markdown", + "id": "42697871-a5be-48cc-b961-799d69fc750b", + "metadata": {}, + "source": [ + "## Pipeline Steps:\n", + "Create a search transform, which emits the document Id, vector score along with the matching text from knowledge base\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bc447d60-0588-4c6d-8a5c-b3f97e12461e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. 
Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600\n", + "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2\n", + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0230c48548aa4229aaecabdea860a5f3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/1 [00:00> beam.Create(data)\n", + " | \"Generate Embedding\" >> MLTransform(write_artifact_location=artifact_location).with_transform(generate_embedding_fn)\n", + " | \"Enrich W/ Redis\" >> Enrichment(redis_handler)\n", + " | \"Print\" >> beam.Map(print)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "9126f5a9-179e-4059-b868-0838b0944902", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "Here we have demonstrated how we can implement Ingestion and Enrichment pipeline using redis vector DB by using ML Transfrom's SentenceTransformerEmbeddings for generating the embeddings of the text chunks." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/beam-ml/rag_usecase/chunks_generation.py b/examples/notebooks/beam-ml/rag_usecase/chunks_generation.py new file mode 100644 index 0000000000000..1dd85d12f633c --- /dev/null +++ b/examples/notebooks/beam-ml/rag_usecase/chunks_generation.py @@ -0,0 +1,129 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import absolute_import + +import apache_beam as beam +from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter +from langchain_text_splitters import SentenceTransformersTokenTextSplitter + +from apache_beam.transforms import DoFn +from apache_beam.transforms import PTransform +from enum import Enum + + +__all__ = [ + 'ChunksGeneration', + 'ChunkingStrategy' +] + +class ChunkingStrategy(Enum): + SPLIT_BY_CHARACTER = 0 + RECURSIVE_SPLIT_BY_CHARACTER = 1 + SPLIT_BY_TOKENS = 2 + + +class ChunksGeneration(PTransform): + """ChunkingStrategy is a ``PTransform`` that takes a ``PCollection`` of + key, value tuple or 2-element array and generates different chunks for documents. + """ + + def __init__( + self, + chunk_size: int, + chunk_overlap: int, + chunking_strategy: ChunkingStrategy + ): + """ + + Args: + chunk_size : Chunk size is the maximum number of characters that a chunk can contain + chunk_overlap : the number of characters that should overlap between two adjacent chunks + chunking_strategy : Defines the way to split text + + Returns: + :class:`~apache_beam.transforms.ptransform.PTransform` + + """ + + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.chunking_strategy = chunking_strategy + + def expand(self, pcoll): + return pcoll \ + | "Generate text chunks" >> beam.ParDo(_GenerateChunksFn(self.chunk_size, + self.chunk_overlap, + self.chunking_strategy)) + + +class _GenerateChunksFn(DoFn): + """Abstract class that takes in ptransform + and generate chunks. 
+ """ + + def __init__( + self, + chunk_size: int, + chunk_overlap: int, + chunking_strategy: ChunkingStrategy + ): + + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.chunking_strategy = chunking_strategy + + def process(self, element, *args, **kwargs): + + # For recursive split by character + if self.chunking_strategy == ChunkingStrategy.RECURSIVE_SPLIT_BY_CHARACTER: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + length_function=len, + is_separator_regex=False, + ) + + # For split by character + elif self.chunking_strategy == ChunkingStrategy.SPLIT_BY_CHARACTER: + text_splitter = CharacterTextSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + length_function=len, + is_separator_regex=False, + ) + + # For split by tokens + elif self.chunking_strategy == ChunkingStrategy.SPLIT_BY_TOKENS: + text_splitter = SentenceTransformersTokenTextSplitter( + chunk_overlap=self.chunk_overlap, + model_name='all-MiniLM-L6-v2' + ) + + else: + raise ValueError(f"Invalid chunking strategy: {self.chunking_strategy}") + + texts = text_splitter.split_text(element['text'])[:] + + element_copy = element.copy() + del element_copy['text'] + for i, section in enumerate(texts): + element_copy['text'] = section + element_copy['section_id'] = i + 1 + yield element_copy + + diff --git a/examples/notebooks/beam-ml/rag_usecase/redis_connector.py b/examples/notebooks/beam-ml/rag_usecase/redis_connector.py new file mode 100644 index 0000000000000..039e5bee95e97 --- /dev/null +++ b/examples/notebooks/beam-ml/rag_usecase/redis_connector.py @@ -0,0 +1,349 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import + +import apache_beam as beam +import numpy as np + +from apache_beam.transforms import DoFn +from apache_beam.transforms import PTransform +from apache_beam.transforms import Reshuffle + + +import redis +from typing import Optional + +# Set the logging level to reduce verbose information +import logging + +logging.root.setLevel(logging.INFO) +logger = logging.getLogger(__name__) + +__all__ = ['InsertDocInRedis', 'InsertEmbeddingInRedis'] + + + +"""This module implements IO classes to read write documents in Redis. + + +Insert Doc in Redis: +----------------- +:class:`InsertDocInRedis` is a ``PTransform`` that writes key and values to a +configured sink, and the write is conducted through a redis pipeline. + +The ptransform works by getting the first and second elements from the input, +this means that inputs like `[k,v]` or `(k,v)` are valid. + +Example usage:: + + pipeline | InsertDocInRedis(host='localhost', + port=6379, + batch_size=100) +""" + + +class InsertDocInRedis(PTransform): + """InsertDocInRedis is a ``PTransform`` that writes a ``PCollection`` of + key, value tuple or 2-element array into a redis server. 
+ """ + + def __init__(self, + host: str, + port: int, + command: Optional[str] = None, + batch_size: int = 100 + ): + + """ + + Args: + host (str): The redis host + port (int): The redis port + command (str): command to be executed with redis client + batch_size(int): Number of key, values pairs to write at once + + Returns: + :class:`~apache_beam.transforms.ptransform.PTransform` + + """ + + self._host = host + self._port = port + self._command = command + self._batch_size = batch_size + + def expand(self, pcoll): + return pcoll \ + | "Reshuffle for Redis Insert" >> Reshuffle() \ + | "Insert document into Redis" >> beam.ParDo(_InsertDocRedisFn(self._host, + self._port, + self._command, + self._batch_size) + ) + + +class _InsertDocRedisFn(DoFn): + """Abstract class that takes in redis + credentials to connect to redis DB + """ + + def __init__(self, + host: str, + port: int, + command: Optional[str] = None, + batch_size: int = 100 + ): + self.host = host + self.port = port + self.command = command + self.batch_size = batch_size + + self.batch_counter = 0 + self.batch = list() + + self.text_col = None + + def finish_bundle(self): + self._flush() + + def process(self, element, *args, **kwargs): + self.batch.append(element) + self.batch_counter += 1 + if self.batch_counter >= self.batch_size: + self._flush() + yield element + + def _flush(self): + if self.batch_counter == 0: + return + + with _InsertDocRedisSink(self.host, self.port) as sink: + + if not self.command: + sink.write(self.batch) + + else: + sink.execute_command(self.command, self.batch) + + self.batch_counter = 0 + self.batch = list() + + +class _InsertDocRedisSink(object): + """Class where we create redis client + and write insertion logic in redis + """ + + def __init__(self, + host: str, + port: int + ): + self.host = host + self.port = port + self.client = None + + def _create_client(self): + if self.client is None: + self.client = redis.Redis(host=self.host, + port=self.port) + + def write(self, 
elements): + self._create_client() + with self.client.pipeline() as pipe: + logger.info(f'Inserting documents in Redis. Total docs: {len(elements)}') + for element in elements: + doc_key = f"doc_{str(element['id'])}_section_{str(element['section_id'])}" + for k, v in element.items(): + logger.debug(f'Inserting doc_key={doc_key}, key={k}, value={v}') + pipe.hset(name=doc_key, key=k, value=v) + + pipe.execute() + logger.info(f'Inserting documents complete.') + + + def execute_command(self, command, elements): + self._create_client() + with self.client.pipeline() as pipe: + for element in elements: + k, v = element + pipe.execute_command(command, k, v) + pipe.execute() + + def __enter__(self): + self._create_client() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.client is not None: + self.client.close() + + +"""This module implements IO classes to read write text Embeddings in Redis. + + +Insert Embedding in Redis : +----------------- +:class:`InsertEmbeddingInRedis` is a ``PTransform`` that writes key and values to a +configured sink, and the write is conducted through a redis pipeline. + +The ptransform works by getting the first and second elements from the input, +this means that inputs like `[k,v]` or `(k,v)` are valid. + +Example usage:: + + pipeline | InsertEmbeddingInRedis(host='localhost', + port=6379, + batch_size=100) +""" + + +class InsertEmbeddingInRedis(PTransform): + """WriteToRedis is a ``PTransform`` that writes a ``PCollection`` of + key, value tuple or 2-element array into a redis server. 
+ """ + + def __init__(self, + host: str, + port: int, + command: Optional[str] = None, + batch_size: int = 100, + embedded_columns: list = [] + ): + + """ + + Args: + host (str): The redis host + port (int): The redis port + command (str): command to be executed with redis client + batch_size (int): Number of key, values pairs to write at once + embedded_columns (list): list of column whose embedding needs to be generated + + Returns: + :class:`~apache_beam.transforms.ptransform.PTransform` + + """ + + self._host = host + self._port = port + self._command = command + self._batch_size = batch_size + self.embedded_columns = embedded_columns + + def expand(self, pcoll): + return pcoll \ + | "Reshuffle for Embedding in Redis Insert" >> Reshuffle() \ + | "Write `Embeddings` to Redis" >> beam.ParDo(_WriteEmbeddingInRedisFn(self._host, + self._port, + self._command, + self._batch_size, + self.embedded_columns)) + + +class _WriteEmbeddingInRedisFn(DoFn): + """Abstract class that takes in redis credentials + to connect to redis DB + """ + + def __init__(self, + host: str, + port: int, + command: Optional[str] = None, + batch_size: int = 100, + embedded_columns: list = [] + ): + self.host = host + self.port = port + self.command = command + self.batch_size = batch_size + self.embedded_columns = embedded_columns + + self.batch_counter = 0 + self.batch = list() + + def finish_bundle(self): + self._flush() + + def process(self, element, *args, **kwargs): + self.batch.append(element) + self.batch_counter += 1 + if self.batch_counter >= self.batch_size: + self._flush() + + def _flush(self): + if self.batch_counter == 0: + return + + with _InsertEmbeddingInRedisSink(self.host, self.port, self.embedded_columns) as sink: + + if not self.command: + sink.write(self.batch) + + else: + sink.execute_command(self.command, self.batch) + + self.batch_counter = 0 + self.batch = list() + + +class _InsertEmbeddingInRedisSink(object): + """Class where we create redis client + and write text 
embedding in redis DB + """ + + def __init__(self, + host: str, + port: int, + embedded_columns: list = [] + ): + self.host = host + self.port = port + self.client = None + self.embedded_columns = embedded_columns + + def _create_client(self): + if self.client is None: + self.client = redis.Redis(host=self.host, + port=self.port) + + def write(self, elements): + self._create_client() + with self.client.pipeline() as pipe: + for element in elements: + doc_key = f"doc_{str(element['id'])}_section_{str(element['section_id'])}" + for k, v in element.items(): + if k in self.embedded_columns: + v = np.array(v, dtype=np.float32).tobytes() + pipe.hset(name=doc_key, key=f'{k}_vector', value=v) + pipe.execute() + + def execute_command(self, command, elements): + self._create_client() + with self.client.pipeline() as pipe: + for element in elements: + k, v = element + pipe.execute_command(command, k, v) + pipe.execute() + + def __enter__(self): + self._create_client() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.client is not None: + self.client.close() \ No newline at end of file diff --git a/examples/notebooks/beam-ml/rag_usecase/redis_enrichment.py b/examples/notebooks/beam-ml/rag_usecase/redis_enrichment.py new file mode 100644 index 0000000000000..df00ede790df8 --- /dev/null +++ b/examples/notebooks/beam-ml/rag_usecase/redis_enrichment.py @@ -0,0 +1,110 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module implements enrichment classes to implement semantic search on Redis Vector DB. + + +Redis :Enrichment Handler +----------------- +:class:`RedisEnrichmentHandler` is a ``EnrichmentSourceHandler`` that performs enrichment/search +by fetching the similar text to the user query/prompt from the knowledge base (redis vector DB) and returns +the similar text along with its embeddings as Beam.Row Object. + +Example usage:: + redis_handler = RedisEnrichmentHandler(redis_host='127.0.0.1', redis_port=6379) + + pipeline | Enrichment(redis_handler) + +No backward compatibility guarantees. Everything in this module is experimental. +""" + +import numpy as np +import redis +from redis.commands.search.query import Query + +import apache_beam as beam +from apache_beam.transforms.enrichment import EnrichmentSourceHandler + +__all__ = [ + 'RedisEnrichmentHandler', +] + + +class RedisEnrichmentHandler(EnrichmentSourceHandler[beam.Row, beam.Row]): + """A handler for :class:`apache_beam.transforms.enrichment.Enrichment` + transform to interact with redis vector DB. 
+ + Args: + redis_host (str): Redis Host to connect to redis DB + redis_port (int): Redis Port to connect to redis DB + index_name (str): Index Name created for searching in Redis DB + vector_field (str): vector field to compute similarity score in vector DB + return_fields (list): returns list of similar text and its embeddings + hybrid_fields (str): fields to be selected + k (int): Value of K in KNN algorithm for searching in redis + """ + + def __init__( + self, + redis_host: str, + redis_port: int, + index_name: str = "embeddings-index", + vector_field: str = "text_vector", + return_fields: list = ["id", "title", "url", "text"], + hybrid_fields: str = "*", + k: int = 2, + ): + self.redis_host = redis_host + self.redis_port = redis_port + self.index_name = index_name + self.vector_field = vector_field + self.return_fields = return_fields + self.hybrid_fields = hybrid_fields + self.k = k + self.client = None + + def __enter__(self): + """connect to the redis DB using redis client.""" + self.client = redis.Redis(host=self.redis_host, port=self.redis_port) + + def __call__(self, request: beam.Row, *args, **kwargs): + """ + Reads a row from the redis Vector DB and returns + a `Tuple` of request and response. + + Args: + request: the input `beam.Row` to enrich. 
+ """ + + # read embedding vector for user query + + embedded_query = request['text'] + + # Prepare the Query + base_query = f'{self.hybrid_fields}=>[KNN {self.k} @{self.vector_field} $vector AS vector_score]' + query = ( + Query(base_query) + .return_fields(*self.return_fields) + .paging(0, self.k) + .dialect(2) + ) + + params_dict = {"vector": np.array(embedded_query).astype(dtype=np.float32).tobytes()} + + # perform vector search + results = self.client.ft(self.index_name).search(query, params_dict) + + return beam.Row(text=embedded_query), beam.Row(docs=results.docs) \ No newline at end of file From 9aaf7e41dec8b9d3effdc7cb6c887f8df8e62a64 Mon Sep 17 00:00:00 2001 From: Svetak Sundhar Date: Mon, 12 Aug 2024 11:00:54 -0400 Subject: [PATCH 67/78] fix link (#32156) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a9bb4e988d58..8a6db16220683 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Here are some resources actively maintained by the Beam community to help you ge A comprehensive, interactive learning experience covering Beam concepts in depth. - Beam Quest + Beam Quest A certification granted by Google Cloud, certifying proficiency in Beam. From edf4c4f5f19ef6ffd25493262261c713ba045980 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:20:55 -0700 Subject: [PATCH 68/78] Bump golang.org/x/sys from 0.23.0 to 0.24.0 in /sdks (#32150) Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.23.0 to 0.24.0. - [Commits](https://github.com/golang/sys/compare/v0.23.0...v0.24.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 5d179736baf09..7c3ade37490d1 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -56,7 +56,7 @@ require ( golang.org/x/net v0.28.0 golang.org/x/oauth2 v0.22.0 golang.org/x/sync v0.8.0 - golang.org/x/sys v0.23.0 + golang.org/x/sys v0.24.0 golang.org/x/text v0.17.0 google.golang.org/api v0.191.0 google.golang.org/genproto v0.0.0-20240730163845-b1a4ccb954bf diff --git a/sdks/go.sum b/sdks/go.sum index 2b43c6d62bc92..5b9bfaef8a48d 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1507,8 +1507,8 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= -golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= From df10609129093bebdd857d4d9e6daaf68d3cbfbb Mon Sep 17 00:00:00 2001 From: Damon Date: Mon, 12 Aug 2024 14:52:26 -0700 Subject: [PATCH 69/78] Update names.py with container image tag (#32160) --- sdks/python/apache_beam/runners/dataflow/internal/names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index 40147e9926dc6..3d51bd21e705b 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -34,6 +34,6 @@ # Unreleased sdks use container image tag specified below. # Update this tag whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. -BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20240613' +BEAM_DEV_SDK_CONTAINER_TAG = 'beam-master-20240809' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' From 74668038c023a451bd84075c2ea33ee71c67336d Mon Sep 17 00:00:00 2001 From: Vitaly Terentyev Date: Tue, 13 Aug 2024 01:52:54 +0400 Subject: [PATCH 70/78] Support withFormatRecordOnFailureFunction() for BigQuery STORAGE_WRITE_API and STORAGE_API_AT_LEAST_ONCE methods (#31659) * Support withFormatRecordOnFailureFunction() for BigQuery STORAGE_WRITE_API and STORAGE_API_AT_LEAST_ONCE methods * Update CHANGES.md --- CHANGES.md | 1 + .../beam/sdk/io/gcp/bigquery/BigQueryIO.java | 18 +- .../io/gcp/bigquery/SplittingIterable.java | 18 +- .../bigquery/StorageApiConvertMessages.java | 8 +- .../StorageApiDynamicDestinations.java | 2 +- .../StorageApiDynamicDestinationsBeamRow.java | 16 +- ...geApiDynamicDestinationsGenericRecord.java | 20 +- .../StorageApiDynamicDestinationsProto.java | 31 +- ...StorageApiDynamicDestinationsTableRow.java | 16 +- .../gcp/bigquery/StorageApiWritePayload.java | 22 +- .../StorageApiWriteUnshardedRecords.java | 65 ++-- .../StorageApiWritesShardedRecords.java | 26 +- .../io/gcp/bigquery/BigQueryIOWriteTest.java | 336 +++++++++++++++++- 13 files changed, 522 insertions(+), 57 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index d082f03fd310e..950abc694488e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -69,6 +69,7 @@ * X feature added (Java/Python) 
([#X](https://github.com/apache/beam/issues/X)). * Go SDK Minimum Go Version updated to 1.21 ([#32092](https://github.com/apache/beam/pull/32092)). +* [BigQueryIO] Added support for withFormatRecordOnFailureFunction() for STORAGE_WRITE_API and STORAGE_API_AT_LEAST_ONCE methods (Java) ([#31354](https://github.com/apache/beam/issues/31354)). * Updated Go protobuf package to new version (Go) ([#21515](https://github.com/apache/beam/issues/21515)). ## Breaking Changes diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java index 1238271c791ea..2a16bf31a6cba 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIO.java @@ -2711,9 +2711,14 @@ public Write withFormatFunction(SerializableFunction formatFunct } /** - * If an insert failure occurs, this function is applied to the originally supplied row T. The - * resulting {@link TableRow} will be accessed via {@link - * WriteResult#getFailedInsertsWithErr()}. + * If an insert failure occurs, this function is applied to the originally supplied T element. + * + *

    For {@link Method#STREAMING_INSERTS} method, the resulting {@link TableRow} will be + * accessed via {@link WriteResult#getFailedInsertsWithErr()}. + * + *

    For {@link Method#STORAGE_WRITE_API} and {@link Method#STORAGE_API_AT_LEAST_ONCE} methods, + * the resulting {@link TableRow} will be accessed via {@link + * WriteResult#getFailedStorageApiInserts()}. */ public Write withFormatRecordOnFailureFunction( SerializableFunction formatFunction) { @@ -3773,6 +3778,7 @@ private WriteResult continueExpandTyped( dynamicDestinations, elementSchema, elementToRowFunction, + getFormatRecordOnFailureFunction(), getRowMutationInformationFn() != null); } else if (getWriteProtosClass() != null && getDirectWriteProtos()) { // We could support both of these by falling back to @@ -3795,7 +3801,9 @@ private WriteResult continueExpandTyped( storageApiDynamicDestinations = (StorageApiDynamicDestinations) new StorageApiDynamicDestinationsProto( - dynamicDestinations, getWriteProtosClass()); + dynamicDestinations, + getWriteProtosClass(), + getFormatRecordOnFailureFunction()); } else if (getAvroRowWriterFactory() != null) { // we can configure the avro to storage write api proto converter for this // assuming the format function returns an Avro GenericRecord @@ -3818,6 +3826,7 @@ private WriteResult continueExpandTyped( dynamicDestinations, avroSchemaFactory, recordWriterFactory.getToAvroFn(), + getFormatRecordOnFailureFunction(), getRowMutationInformationFn() != null); } else { RowWriterFactory.TableRowWriterFactory tableRowWriterFactory = @@ -3827,6 +3836,7 @@ private WriteResult continueExpandTyped( new StorageApiDynamicDestinationsTableRow<>( dynamicDestinations, tableRowWriterFactory.getToRowFn(), + getFormatRecordOnFailureFunction(), getRowMutationInformationFn() != null, getCreateDisposition(), getIgnoreUnknownValues(), diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/SplittingIterable.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/SplittingIterable.java index b8eeb2522cf2b..e40824eab08b6 100644 --- 
a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/SplittingIterable.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/SplittingIterable.java @@ -21,6 +21,7 @@ import com.google.auto.value.AutoValue; import com.google.cloud.bigquery.storage.v1.ProtoRows; import com.google.protobuf.ByteString; +import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; @@ -42,6 +43,8 @@ abstract static class Value { abstract ProtoRows getProtoRows(); abstract List getTimestamps(); + + abstract List<@Nullable TableRow> getFailsafeTableRows(); } interface ConvertUnknownFields { @@ -96,11 +99,18 @@ public Value next() { } List timestamps = Lists.newArrayList(); + List<@Nullable TableRow> failsafeRows = Lists.newArrayList(); ProtoRows.Builder inserts = ProtoRows.newBuilder(); long bytesSize = 0; while (underlyingIterator.hasNext()) { StorageApiWritePayload payload = underlyingIterator.next(); ByteString byteString = ByteString.copyFrom(payload.getPayload()); + @Nullable TableRow failsafeTableRow = null; + try { + failsafeTableRow = payload.getFailsafeTableRow(); + } catch (IOException e) { + // Do nothing, table row will be generated later from row bytes + } if (autoUpdateSchema) { try { @Nullable TableRow unknownFields = payload.getUnknownFields(); @@ -116,7 +126,10 @@ public Value next() { // This generally implies that ignoreUnknownValues=false and there were still // unknown values here. // Reconstitute the TableRow and send it to the failed-rows consumer. - TableRow tableRow = protoToTableRow.apply(byteString); + TableRow tableRow = + failsafeTableRow != null + ? failsafeTableRow + : protoToTableRow.apply(byteString); // TODO(24926, reuvenlax): We need to merge the unknown fields in! Currently we // only execute this // codepath when ignoreUnknownFields==true, so we should never hit this codepath. 
@@ -142,12 +155,13 @@ public Value next() { timestamp = elementsTimestamp; } timestamps.add(timestamp); + failsafeRows.add(failsafeTableRow); bytesSize += byteString.size(); if (bytesSize > splitSize) { break; } } - return new AutoValue_SplittingIterable_Value(inserts.build(), timestamps); + return new AutoValue_SplittingIterable_Value(inserts.build(), timestamps, failsafeRows); } }; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiConvertMessages.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiConvertMessages.java index aefdb79c535c8..0c6f82b9df813 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiConvertMessages.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiConvertMessages.java @@ -179,15 +179,17 @@ public void processElement( .withTimestamp(timestamp); o.get(successfulWritesTag).output(KV.of(element.getKey(), payload)); } catch (TableRowToStorageApiProto.SchemaConversionException conversionException) { - TableRow tableRow; + TableRow failsafeTableRow; try { - tableRow = messageConverter.toTableRow(element.getValue()); + failsafeTableRow = messageConverter.toFailsafeTableRow(element.getValue()); } catch (Exception e) { badRecordRouter.route(o, element, elementCoder, e, "Unable to convert value to TableRow"); return; } o.get(failedWritesTag) - .output(new BigQueryStorageApiInsertError(tableRow, conversionException.toString())); + .output( + new BigQueryStorageApiInsertError( + failsafeTableRow, conversionException.toString())); } catch (Exception e) { badRecordRouter.route( o, element, elementCoder, e, "Unable to convert value to StorageWriteApiPayload"); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinations.java 
b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinations.java index 8ec4d52e3b90f..87667ef2cb171 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinations.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinations.java @@ -34,7 +34,7 @@ public interface MessageConverter { StorageApiWritePayload toMessage( T element, @Nullable RowMutationInformation rowMutationInformation) throws Exception; - TableRow toTableRow(T element); + TableRow toFailsafeTableRow(T element); } StorageApiDynamicDestinations(DynamicDestinations inner) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsBeamRow.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsBeamRow.java index 70ecb06d5b8d7..fd5fe27f0c7c6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsBeamRow.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsBeamRow.java @@ -35,6 +35,7 @@ class StorageApiDynamicDestinationsBeamRow { private final TableSchema tableSchema; private final SerializableFunction toRow; + private final @Nullable SerializableFunction formatRecordOnFailureFunction; private final boolean usesCdc; @@ -42,10 +43,12 @@ class StorageApiDynamicDestinationsBeamRow inner, Schema schema, SerializableFunction toRow, + @Nullable SerializableFunction formatRecordOnFailureFunction, boolean usesCdc) { super(inner); this.tableSchema = BeamRowToStorageApiProto.protoTableSchemaFromBeamSchema(schema); this.toRow = toRow; + this.formatRecordOnFailureFunction = formatRecordOnFailureFunction; this.usesCdc = usesCdc; } @@ -96,12 +99,19 @@ public 
StorageApiWritePayload toMessage( Message msg = BeamRowToStorageApiProto.messageFromBeamRow( descriptorToUse, toRow.apply(element), changeType, changeSequenceNum); - return StorageApiWritePayload.of(msg.toByteArray(), null); + return StorageApiWritePayload.of( + msg.toByteArray(), + null, + formatRecordOnFailureFunction != null ? toFailsafeTableRow(element) : null); } @Override - public TableRow toTableRow(T element) { - return BigQueryUtils.toTableRow(toRow.apply(element)); + public TableRow toFailsafeTableRow(T element) { + if (formatRecordOnFailureFunction != null) { + return formatRecordOnFailureFunction.apply(element); + } else { + return BigQueryUtils.toTableRow(toRow.apply(element)); + } } }; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsGenericRecord.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsGenericRecord.java index c96bb4ce75236..a387495863a26 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsGenericRecord.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsGenericRecord.java @@ -36,16 +36,21 @@ class StorageApiDynamicDestinationsGenericRecord, GenericRecord> toGenericRecord; private final SerializableFunction<@Nullable TableSchema, Schema> schemaFactory; + private final @javax.annotation.Nullable SerializableFunction + formatRecordOnFailureFunction; + private boolean usesCdc; StorageApiDynamicDestinationsGenericRecord( DynamicDestinations inner, SerializableFunction<@Nullable TableSchema, Schema> schemaFactory, SerializableFunction, GenericRecord> toGenericRecord, + @Nullable SerializableFunction formatRecordOnFailureFunction, boolean usesCdc) { super(inner); this.toGenericRecord = toGenericRecord; this.schemaFactory = schemaFactory; + 
this.formatRecordOnFailureFunction = formatRecordOnFailureFunction; this.usesCdc = usesCdc; } @@ -96,13 +101,20 @@ public StorageApiWritePayload toMessage( toGenericRecord.apply(new AvroWriteRequest<>(element, avroSchema)), changeType, changeSequenceNum); - return StorageApiWritePayload.of(msg.toByteArray(), null); + return StorageApiWritePayload.of( + msg.toByteArray(), + null, + formatRecordOnFailureFunction != null ? toFailsafeTableRow(element) : null); } @Override - public TableRow toTableRow(T element) { - return BigQueryUtils.convertGenericRecordToTableRow( - toGenericRecord.apply(new AvroWriteRequest<>(element, avroSchema)), bqTableSchema); + public TableRow toFailsafeTableRow(T element) { + if (formatRecordOnFailureFunction != null) { + return formatRecordOnFailureFunction.apply(element); + } else { + return BigQueryUtils.convertGenericRecordToTableRow( + toGenericRecord.apply(new AvroWriteRequest<>(element, avroSchema)), bqTableSchema); + } } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsProto.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsProto.java index 57dbdc9d1e770..d7359f99b96df 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsProto.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsProto.java @@ -22,23 +22,29 @@ import com.google.cloud.bigquery.storage.v1.TableSchema; import com.google.protobuf.DescriptorProtos; import com.google.protobuf.Descriptors; +import com.google.protobuf.DynamicMessage; import com.google.protobuf.Message; import java.lang.reflect.InvocationTargetException; import javax.annotation.Nullable; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService; +import 
org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.util.Preconditions; import org.checkerframework.checker.nullness.qual.NonNull; /** Storage API DynamicDestinations used when the input is a compiled protocol buffer. */ class StorageApiDynamicDestinationsProto extends StorageApiDynamicDestinations { - DescriptorProtos.DescriptorProto descriptorProto; + private final DescriptorProtos.DescriptorProto descriptorProto; + private final @Nullable SerializableFunction formatRecordOnFailureFunction; @SuppressWarnings({"unchecked", "nullness"}) StorageApiDynamicDestinationsProto( - DynamicDestinations inner, Class protoClass) { + DynamicDestinations inner, + Class protoClass, + @Nullable SerializableFunction formatRecordOnFailureFunction) { super(inner); try { + this.formatRecordOnFailureFunction = formatRecordOnFailureFunction; this.descriptorProto = fixNestedTypes( (Descriptors.Descriptor) @@ -84,12 +90,27 @@ public StorageApiWritePayload toMessage( // we can forward // the through directly. This means that we don't currently support ignoreUnknownValues or // autoUpdateSchema. - return StorageApiWritePayload.of(element.toByteArray(), null); + return StorageApiWritePayload.of( + element.toByteArray(), + null, + formatRecordOnFailureFunction != null ? 
toFailsafeTableRow(element) : null); } @Override - public TableRow toTableRow(T element) { - throw new RuntimeException("Not implemented!"); + public TableRow toFailsafeTableRow(T element) { + if (formatRecordOnFailureFunction != null) { + return formatRecordOnFailureFunction.apply(element); + } else { + try { + return TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom( + TableRowToStorageApiProto.wrapDescriptorProto(descriptorProto), + element.toByteArray()), + true); + } catch (Exception e) { + throw new RuntimeException(e); + } + } } }; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsTableRow.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsTableRow.java index 264dac34473ed..08588cfc78500 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsTableRow.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiDynamicDestinationsTableRow.java @@ -36,6 +36,7 @@ public class StorageApiDynamicDestinationsTableRow extends StorageApiDynamicDestinations { private final SerializableFunction formatFunction; + private final @Nullable SerializableFunction formatRecordOnFailureFunction; private final boolean usesCdc; private final CreateDisposition createDisposition; @@ -51,12 +52,14 @@ public class StorageApiDynamicDestinationsTableRow inner, SerializableFunction formatFunction, + @Nullable SerializableFunction formatRecordOnFailureFunction, boolean usesCdc, CreateDisposition createDisposition, boolean ignoreUnknownValues, boolean autoSchemaUpdates) { super(inner); this.formatFunction = formatFunction; + this.formatRecordOnFailureFunction = formatRecordOnFailureFunction; this.usesCdc = usesCdc; this.createDisposition = createDisposition; this.ignoreUnknownValues = ignoreUnknownValues; 
@@ -151,8 +154,12 @@ public DescriptorProtos.DescriptorProto getDescriptor(boolean includeCdcColumns) } @Override - public TableRow toTableRow(T element) { - return formatFunction.apply(element); + public TableRow toFailsafeTableRow(T element) { + if (formatRecordOnFailureFunction != null) { + return formatRecordOnFailureFunction.apply(element); + } else { + return formatFunction.apply(element); + } } @Override @@ -183,7 +190,10 @@ public StorageApiWritePayload toMessage( unknownFields, changeType, changeSequenceNum); - return StorageApiWritePayload.of(msg.toByteArray(), unknownFields); + return StorageApiWritePayload.of( + msg.toByteArray(), + unknownFields, + formatRecordOnFailureFunction != null ? toFailsafeTableRow(element) : null); } }; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritePayload.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritePayload.java index 5b6f27949870b..f0fce11b2d32b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritePayload.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritePayload.java @@ -39,12 +39,17 @@ public abstract class StorageApiWritePayload { public abstract @Nullable Instant getTimestamp(); + @SuppressWarnings("mutable") + public abstract @Nullable byte[] getFailsafeTableRowPayload(); + @AutoValue.Builder public abstract static class Builder { public abstract Builder setPayload(byte[] value); public abstract Builder setUnknownFieldsPayload(@Nullable byte[] value); + public abstract Builder setFailsafeTableRowPayload(@Nullable byte[] value); + public abstract Builder setTimestamp(@Nullable Instant value); public abstract StorageApiWritePayload build(); @@ -53,15 +58,22 @@ public abstract static class Builder { public abstract Builder toBuilder(); @SuppressWarnings("nullness") - static 
StorageApiWritePayload of(byte[] payload, @Nullable TableRow unknownFields) + static StorageApiWritePayload of( + byte[] payload, @Nullable TableRow unknownFields, @Nullable TableRow failsafeTableRow) throws IOException { @Nullable byte[] unknownFieldsPayload = null; if (unknownFields != null) { unknownFieldsPayload = CoderUtils.encodeToByteArray(TableRowJsonCoder.of(), unknownFields); } + @Nullable byte[] failsafeTableRowPayload = null; + if (failsafeTableRow != null) { + failsafeTableRowPayload = + CoderUtils.encodeToByteArray(TableRowJsonCoder.of(), failsafeTableRow); + } return new AutoValue_StorageApiWritePayload.Builder() .setPayload(payload) .setUnknownFieldsPayload(unknownFieldsPayload) + .setFailsafeTableRowPayload(failsafeTableRowPayload) .setTimestamp(null) .build(); } @@ -77,4 +89,12 @@ public StorageApiWritePayload withTimestamp(Instant instant) { } return CoderUtils.decodeFromByteArray(TableRowJsonCoder.of(), fields); } + + public @Memoized @Nullable TableRow getFailsafeTableRow() throws IOException { + @Nullable byte[] failsafeTableRowPayload = getFailsafeTableRowPayload(); + if (failsafeTableRowPayload == null) { + return null; + } + return CoderUtils.decodeFromByteArray(TableRowJsonCoder.of(), failsafeTableRowPayload); + } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java index 8a902ec6d264e..369bb2d78634c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigquery; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; +import 
static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkState; import com.google.api.core.ApiFuture; @@ -257,15 +258,20 @@ static class AppendRowsContext extends RetryManager.Operation.Context timestamps; + List<@Nullable TableRow> failsafeTableRows; int failureCount; public AppendRowsContext( - long offset, ProtoRows protoRows, List timestamps) { + long offset, + ProtoRows protoRows, + List timestamps, + List<@Nullable TableRow> failsafeTableRows) { this.offset = offset; this.protoRows = protoRows; this.timestamps = timestamps; this.failureCount = 0; + this.failsafeTableRows = failsafeTableRows; } } @@ -278,6 +284,7 @@ class DestinationState { private long currentOffset = 0; private List pendingMessages; private List pendingTimestamps; + private List<@Nullable TableRow> pendingFailsafeTableRows; private transient @Nullable WriteStreamService maybeWriteStreamService; private final Counter recordsAppended = Metrics.counter(WriteRecordsDoFn.class, "recordsAppended"); @@ -319,6 +326,7 @@ public DestinationState( this.shortTableUrn = shortTableUrn; this.pendingMessages = Lists.newArrayList(); this.pendingTimestamps = Lists.newArrayList(); + this.pendingFailsafeTableRows = Lists.newArrayList(); this.maybeWriteStreamService = writeStreamService; this.useDefaultStream = useDefaultStream; this.initialTableSchema = messageConverter.getTableSchema(); @@ -553,6 +561,7 @@ void addMessage( throws Exception { maybeTickleCache(); ByteString payloadBytes = ByteString.copyFrom(payload.getPayload()); + @Nullable TableRow failsafeTableRow = payload.getFailsafeTableRow(); if (autoUpdateSchema) { if (appendClientInfo == null) { appendClientInfo = getAppendClientInfo(true, null); @@ -565,7 +574,10 @@ void addMessage( Preconditions.checkStateNotNull(appendClientInfo) .encodeUnknownFields(unknownFields, ignoreUnknownValues)); } catch 
(TableRowToStorageApiProto.SchemaConversionException e) { - TableRow tableRow = appendClientInfo.toTableRow(payloadBytes); + @Nullable TableRow tableRow = payload.getFailsafeTableRow(); + if (tableRow == null) { + tableRow = checkNotNull(appendClientInfo).toTableRow(payloadBytes); + } // TODO(24926, reuvenlax): We need to merge the unknown fields in! Currently we only // execute this // codepath when ignoreUnknownFields==true, so we should never hit this codepath. @@ -583,6 +595,8 @@ void addMessage( } } pendingMessages.add(payloadBytes); + pendingFailsafeTableRows.add(failsafeTableRow); + org.joda.time.Instant timestamp = payload.getTimestamp(); pendingTimestamps.add(timestamp != null ? timestamp : elementTs); } @@ -601,7 +615,9 @@ long flush( pendingMessages.clear(); final ProtoRows inserts = insertsBuilder.build(); List insertTimestamps = pendingTimestamps; + List<@Nullable TableRow> failsafeTableRows = pendingFailsafeTableRows; pendingTimestamps = Lists.newArrayList(); + pendingFailsafeTableRows = Lists.newArrayList(); // Handle the case where the request is too large. 
if (inserts.getSerializedSize() >= maxRequestSize) { @@ -616,15 +632,18 @@ long flush( maxRequestSize); } for (int i = 0; i < inserts.getSerializedRowsCount(); ++i) { - ByteString rowBytes = inserts.getSerializedRows(i); + @Nullable TableRow failedRow = failsafeTableRows.get(i); + if (failedRow == null) { + ByteString rowBytes = inserts.getSerializedRows(i); + failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom( + TableRowToStorageApiProto.wrapDescriptorProto( + getAppendClientInfo(true, null).getDescriptor()), + rowBytes), + true); + } org.joda.time.Instant timestamp = insertTimestamps.get(i); - TableRow failedRow = - TableRowToStorageApiProto.tableRowFromMessage( - DynamicMessage.parseFrom( - TableRowToStorageApiProto.wrapDescriptorProto( - getAppendClientInfo(true, null).getDescriptor()), - rowBytes), - true); failedRowsReceiver.outputWithTimestamp( new BigQueryStorageApiInsertError( failedRow, "Row payload too large. Maximum size " + maxRequestSize), @@ -647,7 +666,7 @@ long flush( this.currentOffset += inserts.getSerializedRowsCount(); } AppendRowsContext appendRowsContext = - new AppendRowsContext(offset, inserts, insertTimestamps); + new AppendRowsContext(offset, inserts, insertTimestamps, failsafeTableRows); retryManager.addOperation( c -> { @@ -692,18 +711,22 @@ long flush( Set failedRowIndices = error.getRowIndexToErrorMessage().keySet(); for (int failedIndex : failedRowIndices) { // Convert the message to a TableRow and send it to the failedRows collection. 
- ByteString protoBytes = failedContext.protoRows.getSerializedRows(failedIndex); - org.joda.time.Instant timestamp = failedContext.timestamps.get(failedIndex); BigQueryStorageApiInsertError element = null; + org.joda.time.Instant timestamp = failedContext.timestamps.get(failedIndex); try { - TableRow failedRow = - TableRowToStorageApiProto.tableRowFromMessage( - DynamicMessage.parseFrom( - TableRowToStorageApiProto.wrapDescriptorProto( - Preconditions.checkStateNotNull(appendClientInfo) - .getDescriptor()), - protoBytes), - true); + TableRow failedRow = failedContext.failsafeTableRows.get(failedIndex); + if (failedRow == null) { + ByteString protoBytes = + failedContext.protoRows.getSerializedRows(failedIndex); + failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom( + TableRowToStorageApiProto.wrapDescriptorProto( + Preconditions.checkStateNotNull(appendClientInfo) + .getDescriptor()), + protoBytes), + true); + } element = new BigQueryStorageApiInsertError( failedRow, error.getRowIndexToErrorMessage().get(failedIndex)); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java index a7da19a75f850..f3f512110b50f 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java @@ -149,12 +149,17 @@ class AppendRowsContext extends RetryManager.Operation.Context timestamps; + List<@Nullable TableRow> failsafeTableRows; AppendRowsContext( - ShardedKey key, ProtoRows protoRows, List timestamps) { + ShardedKey key, + ProtoRows protoRows, + List timestamps, + List<@Nullable TableRow> failsafeTableRows) { this.key = key; this.protoRows = protoRows; 
this.timestamps = timestamps; + this.failsafeTableRows = failsafeTableRows; } @Override @@ -685,8 +690,11 @@ public void process( Set failedRowIndices = error.getRowIndexToErrorMessage().keySet(); for (int failedIndex : failedRowIndices) { // Convert the message to a TableRow and send it to the failedRows collection. - ByteString protoBytes = failedContext.protoRows.getSerializedRows(failedIndex); - TableRow failedRow = appendClientInfo.get().toTableRow(protoBytes); + TableRow failedRow = failedContext.failsafeTableRows.get(failedIndex); + if (failedRow == null) { + ByteString protoBytes = failedContext.protoRows.getSerializedRows(failedIndex); + failedRow = appendClientInfo.get().toTableRow(protoBytes); + } org.joda.time.Instant timestamp = failedContext.timestamps.get(failedIndex); o.get(failedRowsTag) .outputWithTimestamp( @@ -851,9 +859,12 @@ public void process( + ". This is unexpected. All rows in the request will be sent to the failed-rows PCollection."); } for (int i = 0; i < splitValue.getProtoRows().getSerializedRowsCount(); ++i) { - ByteString rowBytes = splitValue.getProtoRows().getSerializedRows(i); org.joda.time.Instant timestamp = splitValue.getTimestamps().get(i); - TableRow failedRow = appendClientInfo.get().toTableRow(rowBytes); + TableRow failedRow = splitValue.getFailsafeTableRows().get(i); + if (failedRow == null) { + ByteString rowBytes = splitValue.getProtoRows().getSerializedRows(i); + failedRow = appendClientInfo.get().toTableRow(rowBytes); + } o.get(failedRowsTag) .outputWithTimestamp( new BigQueryStorageApiInsertError( @@ -872,7 +883,10 @@ public void process( // RetryManager AppendRowsContext context = new AppendRowsContext( - element.getKey(), splitValue.getProtoRows(), splitValue.getTimestamps()); + element.getKey(), + splitValue.getProtoRows(), + splitValue.getTimestamps(), + splitValue.getFailsafeTableRows()); contexts.add(context); retryManager.addOperation(runOperation, onError, onSuccess, context); 
recordsAppended.inc(splitValue.getProtoRows().getSerializedRowsCount()); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java index c5af8045bfe20..2736ed7beb881 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java @@ -83,6 +83,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.LongStream; import java.util.stream.StreamSupport; import org.apache.avro.Schema.Field; @@ -3065,7 +3066,312 @@ public void testStreamingInsertsExtendedErrorRetrieval() throws Exception { } @Test - public void testStorageApiErrors() throws Exception { + public void testStorageApiErrorsWriteProto() throws Exception { + assumeTrue(useStorageApi); + final Method method = + useStorageApiApproximate ? 
Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API; + + final int failFrom = 10; + + Function getPrimitive = + (Integer i) -> + Proto3SchemaMessages.Primitive.newBuilder() + .setPrimitiveDouble(i) + .setPrimitiveFloat(i) + .setPrimitiveInt32(i) + .setPrimitiveInt64(i) + .setPrimitiveUint32(i) + .setPrimitiveUint64(i) + .setPrimitiveSint32(i) + .setPrimitiveSint64(i) + .setPrimitiveFixed32(i) + .setPrimitiveFixed64(i) + .setPrimitiveBool(true) + .setPrimitiveString(Integer.toString(i)) + .setPrimitiveBytes( + ByteString.copyFrom(Integer.toString(i).getBytes(StandardCharsets.UTF_8))) + .build(); + List goodRows = + IntStream.range(1, 20).mapToObj(getPrimitive::apply).collect(Collectors.toList()); + + Function getPrimitiveRow = + (Integer i) -> + new TableRow() + .set("primitive_double", Double.valueOf(i)) + .set("primitive_float", Float.valueOf(i).doubleValue()) + .set("primitive_int32", i.intValue()) + .set("primitive_int64", i.toString()) + .set("primitive_uint32", i.toString()) + .set("primitive_uint64", i.toString()) + .set("primitive_sint32", i.toString()) + .set("primitive_sint64", i.toString()) + .set("primitive_fixed32", i.toString()) + .set("primitive_fixed64", i.toString()) + .set("primitive_bool", true) + .set("primitive_string", i.toString()) + .set( + "primitive_bytes", + BaseEncoding.base64() + .encode( + ByteString.copyFrom(i.toString().getBytes(StandardCharsets.UTF_8)) + .toByteArray())); + + Function shouldFailRow = + (Function & Serializable) + tr -> + tr.containsKey("primitive_int32") + && (Integer) tr.get("primitive_int32") >= failFrom; + fakeDatasetService.setShouldFailRow(shouldFailRow); + + SerializableFunction formatRecordOnFailureFunction = + input -> { + TableRow failedTableRow = new TableRow().set("testFailureFunctionField", "testValue"); + failedTableRow.set("originalValue", input.getPrimitiveFixed32()); + return failedTableRow; + }; + + WriteResult result = + p.apply(Create.of(goodRows)) + .apply( + 
BigQueryIO.writeProtos(Proto3SchemaMessages.Primitive.class) + .to("project-id:dataset-id.table") + .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) + .withMethod(method) + .withoutValidation() + .withFormatRecordOnFailureFunction(formatRecordOnFailureFunction) + .withPropagateSuccessfulStorageApiWrites(true) + .withTestServices(fakeBqServices)); + + PCollection deadRows = + result + .getFailedStorageApiInserts() + .apply( + MapElements.into(TypeDescriptor.of(TableRow.class)) + .via(BigQueryStorageApiInsertError::getRow)); + + List expectedFailedRows = + goodRows.stream() + .filter(primitive -> primitive.getPrimitiveFixed32() >= failFrom) + .map(formatRecordOnFailureFunction::apply) + .collect(Collectors.toList()); + PAssert.that(deadRows).containsInAnyOrder(expectedFailedRows); + p.run(); + + // Round trip through the coder to make sure the types match our expected types. + assertThat( + fakeDatasetService.getAllRows("project-id", "dataset-id", "table").stream() + .map( + tr -> { + try { + byte[] bytes = CoderUtils.encodeToByteArray(TableRowJsonCoder.of(), tr); + return CoderUtils.decodeFromByteArray(TableRowJsonCoder.of(), bytes); + } catch (Exception e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()), + containsInAnyOrder( + Iterables.toArray( + Iterables.filter( + goodRows.stream() + .map(primitive -> getPrimitiveRow.apply(primitive.getPrimitiveFixed32())) + .collect(Collectors.toList()), + r -> !shouldFailRow.apply(r)), + TableRow.class))); + } + + @Test + public void testStorageApiErrorsWriteBeamRow() throws Exception { + assumeTrue(useStorageApi); + final Method method = + useStorageApiApproximate ? 
Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API; + + final int failFrom = 10; + final String shouldFailName = "failme"; + + List goodRows = + Lists.newArrayList( + new SchemaPojo("a", 1), + new SchemaPojo("b", 2), + new SchemaPojo("c", 10), + new SchemaPojo("d", 11), + new SchemaPojo(shouldFailName, 1)); + + String nameField = "name"; + String numberField = "number"; + Function shouldFailRow = + (Function & Serializable) + tr -> + shouldFailName.equals(tr.get(nameField)) + || (Integer.valueOf((String) tr.get(numberField)) >= failFrom); + fakeDatasetService.setShouldFailRow(shouldFailRow); + + SerializableFunction formatRecordOnFailureFunction = + input -> { + TableRow failedTableRow = new TableRow().set("testFailureFunctionField", "testValue"); + failedTableRow.set("originalName", input.name); + failedTableRow.set("originalNumber", input.number); + return failedTableRow; + }; + + WriteResult result = + p.apply(Create.of(goodRows)) + .apply( + BigQueryIO.write() + .to("project-id:dataset-id.table") + .withMethod(method) + .useBeamSchema() + .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) + .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()) + .withPropagateSuccessfulStorageApiWrites(true) + .withTestServices(fakeBqServices) + .withFormatRecordOnFailureFunction(formatRecordOnFailureFunction) + .withoutValidation()); + + PCollection deadRows = + result + .getFailedStorageApiInserts() + .apply( + MapElements.into(TypeDescriptor.of(TableRow.class)) + .via(BigQueryStorageApiInsertError::getRow)); + PCollection successfulRows = result.getSuccessfulStorageApiInserts(); + + List expectedFailedRows = + goodRows.stream() + .filter(pojo -> shouldFailName.equals(pojo.name) || pojo.number >= failFrom) + .map(formatRecordOnFailureFunction::apply) + .collect(Collectors.toList()); + PAssert.that(deadRows).containsInAnyOrder(expectedFailedRows); + PAssert.that(successfulRows) + .containsInAnyOrder( + Iterables.toArray( + 
Iterables.filter( + goodRows.stream() + .map( + pojo -> { + TableRow tableRow = new TableRow(); + tableRow.set(nameField, pojo.name); + tableRow.set(numberField, String.valueOf(pojo.number)); + return tableRow; + }) + .collect(Collectors.toList()), + r -> !shouldFailRow.apply(r)), + TableRow.class)); + p.run(); + + assertThat( + fakeDatasetService.getAllRows("project-id", "dataset-id", "table"), + containsInAnyOrder( + Iterables.toArray( + Iterables.filter( + goodRows.stream() + .map( + pojo -> { + TableRow tableRow = new TableRow(); + tableRow.set(nameField, pojo.name); + tableRow.set(numberField, String.valueOf(pojo.number)); + return tableRow; + }) + .collect(Collectors.toList()), + r -> !shouldFailRow.apply(r)), + TableRow.class))); + } + + @Test + public void testStorageApiErrorsWriteGenericRecord() throws Exception { + assumeTrue(useStorageApi); + final Method method = + useStorageApiApproximate ? Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API; + + final long failFrom = 10L; + List goodRows = LongStream.range(0, 20).boxed().collect(Collectors.toList()); + + String fieldName = "number"; + Function shouldFailRow = + (Function & Serializable) + tr -> (Long.valueOf((String) tr.get(fieldName))) >= failFrom; + fakeDatasetService.setShouldFailRow(shouldFailRow); + + SerializableFunction formatRecordOnFailureFunction = + input -> { + TableRow failedTableRow = new TableRow().set("testFailureFunctionField", "testValue"); + failedTableRow.set("originalElement", input); + return failedTableRow; + }; + + WriteResult result = + p.apply(Create.of(goodRows)) + .apply( + BigQueryIO.write() + .to("project-id:dataset-id.table") + .withMethod(method) + .withAvroFormatFunction( + (SerializableFunction, GenericRecord>) + input -> + new GenericRecordBuilder(avroSchema) + .set(fieldName, input.getElement()) + .build()) + .withSchema( + new TableSchema() + .setFields( + ImmutableList.of( + new TableFieldSchema().setName(fieldName).setType("INTEGER")))) + 
.withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) + .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()) + .withPropagateSuccessfulStorageApiWrites(true) + .withTestServices(fakeBqServices) + .withFormatRecordOnFailureFunction(formatRecordOnFailureFunction) + .withoutValidation()); + + PCollection deadRows = + result + .getFailedStorageApiInserts() + .apply( + MapElements.into(TypeDescriptor.of(TableRow.class)) + .via(BigQueryStorageApiInsertError::getRow)); + PCollection successfulRows = result.getSuccessfulStorageApiInserts(); + + List expectedFailedRows = + goodRows.stream() + .filter(l -> l >= failFrom) + .map(formatRecordOnFailureFunction::apply) + .collect(Collectors.toList()); + PAssert.that(deadRows).containsInAnyOrder(expectedFailedRows); + PAssert.that(successfulRows) + .containsInAnyOrder( + Iterables.toArray( + Iterables.filter( + goodRows.stream() + .map( + l -> { + TableRow tableRow = new TableRow(); + tableRow.set(fieldName, String.valueOf(l)); + return tableRow; + }) + .collect(Collectors.toList()), + r -> !shouldFailRow.apply(r)), + TableRow.class)); + p.run(); + + assertThat( + fakeDatasetService.getAllRows("project-id", "dataset-id", "table"), + containsInAnyOrder( + Iterables.toArray( + Iterables.filter( + goodRows.stream() + .map( + l -> { + TableRow tableRow = new TableRow(); + tableRow.set(fieldName, String.valueOf(l)); + return tableRow; + }) + .collect(Collectors.toList()), + r -> !shouldFailRow.apply(r)), + TableRow.class))); + } + + @Test + public void testStorageApiErrorsWriteTableRows() throws Exception { assumeTrue(useStorageApi); final Method method = useStorageApiApproximate ? 
Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API; @@ -3132,6 +3438,22 @@ public void testStorageApiErrors() throws Exception { tr -> tr.containsKey("name") && tr.get("name").equals(failValue); fakeDatasetService.setShouldFailRow(shouldFailRow); + SerializableFunction formatRecordOnFailureFunction = + input -> { + TableRow failedTableRow = new TableRow().set("testFailureFunctionField", "testValue"); + if (input != null) { + Object name = input.get("name"); + if (name != null) { + failedTableRow.set("name", name); + } + Object number = input.get("number"); + if (number != null) { + failedTableRow.set("number", number); + } + } + return failedTableRow; + }; + WriteResult result = p.apply(Create.of(Iterables.concat(goodRows, badRows))) .apply( @@ -3143,6 +3465,7 @@ public void testStorageApiErrors() throws Exception { .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()) .withPropagateSuccessfulStorageApiWrites(true) .withTestServices(fakeBqServices) + .withFormatRecordOnFailureFunction(formatRecordOnFailureFunction) .withoutValidation()); PCollection deadRows = @@ -3153,9 +3476,14 @@ public void testStorageApiErrors() throws Exception { .via(BigQueryStorageApiInsertError::getRow)); PCollection successfulRows = result.getSuccessfulStorageApiInserts(); - PAssert.that(deadRows) - .containsInAnyOrder( - Iterables.concat(badRows, Iterables.filter(goodRows, shouldFailRow::apply))); + List expectedFailedRows = + badRows.stream().map(formatRecordOnFailureFunction::apply).collect(Collectors.toList()); + expectedFailedRows.addAll( + goodRows.stream() + .filter(shouldFailRow::apply) + .map(formatRecordOnFailureFunction::apply) + .collect(Collectors.toList())); + PAssert.that(deadRows).containsInAnyOrder(expectedFailedRows); PAssert.that(successfulRows) .containsInAnyOrder( Iterables.toArray( From b0f2683cda15c1b308bd583bca99be992fdee79b Mon Sep 17 00:00:00 2001 From: Kiruphasankaran Nataraj Date: Tue, 13 Aug 2024 20:01:46 +0530 Subject: [PATCH 71/78] 
GitHub issue #30257 Adds a static comparing method to the SerializableComparator interface --- .../transforms/SerializableComparator.java | 21 ++++++- .../SerializableComparatorTest.java | 63 +++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SerializableComparatorTest.java diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/SerializableComparator.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/SerializableComparator.java index c66fbb7d7497b..16304633c993b 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/SerializableComparator.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/transforms/SerializableComparator.java @@ -19,10 +19,29 @@ import java.io.Serializable; import java.util.Comparator; +import java.util.Objects; +import java.util.function.Function; /** * A {@code Comparator} that is also {@code Serializable}. * * @param type of values being compared */ -public interface SerializableComparator extends Comparator, Serializable {} +public interface SerializableComparator extends Comparator, Serializable { + /** + * Analogous to {@link Comparator#comparing(Function)}, except that it takes in a {@link + * SerializableFunction} as the key extractor and returns a {@link SerializableComparator}. 
+ * + * @param keyExtractor the function used to extract the {@link java.lang.Comparable} sort key + * @return A {@link SerializableComparator} that compares by an extracted key + * @param the type of element to be compared + * @param the type of the {@code Comparable} sort key + * @see Comparator#comparing(Function) + */ + static > SerializableComparator comparing( + SerializableFunction keyExtractor) { + Objects.requireNonNull(keyExtractor); + return (SerializableComparator) + (c1, c2) -> keyExtractor.apply(c1).compareTo(keyExtractor.apply(c2)); + } +} diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SerializableComparatorTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SerializableComparatorTest.java new file mode 100644 index 0000000000000..09583ec44f28c --- /dev/null +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/transforms/SerializableComparatorTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.transforms; + +import java.io.Serializable; +import java.util.function.Function; +import org.apache.beam.sdk.util.SerializableUtils; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for {@link SerializableComparator}. */ +@RunWith(JUnit4.class) +public class SerializableComparatorTest { + + /** + * Tests if the {@link SerializableComparator} returned by {@link + * SerializableComparator#comparing(SerializableFunction)} is serializable, using {@link + * SerializableUtils#ensureSerializable(Serializable)}. + */ + @Test + public void testSerializable() { + SerializableFunction fn = Integer::parseInt; + + SerializableComparator cmp = SerializableComparator.comparing(fn); + SerializableUtils.ensureSerializable(cmp); + } + + /** + * Tests if {@link SerializableComparator#comparing(Function)} throws a {@link + * java.lang.NullPointerException} if null is passed to it. + */ + @Test(expected = NullPointerException.class) + public void testIfNPEThrownForNullFunction() { + SerializableComparator.comparing(null); + } + + /** Tests the basic comparison function of the {@link SerializableComparator} returned.
*/ + @Test + public void testBasicComparison() { + SerializableFunction fn = Integer::parseInt; + SerializableComparator cmp = SerializableComparator.comparing(fn); + + Assert.assertTrue(cmp.compare("1", "10") < 0); + Assert.assertTrue(cmp.compare("9", "6") > 0); + } +} From b2d26b6b5f376db079679d620a812af25c4a90f8 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Tue, 13 Aug 2024 17:43:30 +0200 Subject: [PATCH 72/78] Fix upload_graph on v2 (#32165) * Fix upload_graph on v2 * compliation nits * compliation nits * remove streaming test change, update CHANGES * mutability fix * Test fix * Remove upload_graph from it --- CHANGES.md | 10 ++++++++++ .../beam/runners/dataflow/DataflowRunner.java | 12 +++++++++++- .../beam/runners/dataflow/DataflowRunnerTest.java | 13 +++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 950abc694488e..fce3aa26a72b7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -83,6 +83,7 @@ ## Bugfixes * Fixed incorrect service account impersonation flow for Python pipelines using BigQuery IOs ([#32030](https://github.com/apache/beam/issues/32030)). +* Auto-disable broken and meaningless `upload_graph` feature when using Dataflow Runner V2 ([#32159](https://github.com/apache/beam/issues/32159)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). 
@@ -116,6 +117,10 @@ * [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) * Fixed a logging issue where Python worker dependency installation logs sometimes were not emitted in a timely manner ([#31977](https://github.com/apache/beam/pull/31977)) +## Known Issues + +* Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). + # [2.57.0] - 2024-06-26 ## Highlights @@ -167,6 +172,10 @@ jackson-2.15 has known breaking changes. An important one is it imposed a buffer limit for parser. If your custom PTransform/DoFn are affected, refer to [#31580](https://github.com/apache/beam/pull/31580) for mitigation. +## Known Issues + +* Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). + # [2.56.0] - 2024-05-01 ## Highlights @@ -202,6 +211,7 @@ * The beam interactive runner does not correctly run on flink ([#31168](https://github.com/apache/beam/issues/31168)). * When using the Flink runner from Python, 1.17 is not supported and 1.12/13 do not work correctly. Support for 1.17 will be added in 2.57.0, and the ability to choose 1.12/13 will be cleaned up and fully removed in 2.57.0 as well ([#31168](https://github.com/apache/beam/issues/31168)). +* Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). 
# [2.55.1] - 2024-04-08 diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 708c63413268a..abe7d0d364d3f 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -1385,7 +1385,8 @@ public DataflowPipelineJob run(Pipeline pipeline) { byte[] jobGraphBytes = DataflowPipelineTranslator.jobToString(newJob).getBytes(UTF_8); int jobGraphByteSize = jobGraphBytes.length; if (jobGraphByteSize >= CREATE_JOB_REQUEST_LIMIT_BYTES - && !hasExperiment(options, "upload_graph")) { + && !hasExperiment(options, "upload_graph") + && !useUnifiedWorker(options)) { List experiments = firstNonNull(options.getExperiments(), Collections.emptyList()); options.setExperiments( ImmutableList.builder().addAll(experiments).add("upload_graph").build()); @@ -1396,6 +1397,15 @@ public DataflowPipelineJob run(Pipeline pipeline) { CREATE_JOB_REQUEST_LIMIT_BYTES); } + if (hasExperiment(options, "upload_graph") && useUnifiedWorker(options)) { + ArrayList experiments = new ArrayList<>(options.getExperiments()); + while (experiments.remove("upload_graph")) {} + options.setExperiments(experiments); + LOG.warn( + "The upload_graph experiment was specified, but it does not apply " + + "to runner v2 jobs. Option has been automatically removed."); + } + // Upload the job to GCS and remove the graph object from the API call. The graph // will be downloaded from GCS by the service. 
if (hasExperiment(options, "upload_graph")) { diff --git a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java index cf1066e41d25e..37c20c61ad8e5 100644 --- a/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java +++ b/runners/google-cloud-dataflow-java/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerTest.java @@ -838,6 +838,19 @@ public void testUploadGraph() throws IOException { .startsWith("gs://valid-bucket/temp/staging/dataflow_graph")); } + @Test + public void testUploadGraphV2IsNoOp() throws IOException { + DataflowPipelineOptions options = buildPipelineOptions(); + options.setExperiments(Arrays.asList("upload_graph", "use_runner_v2")); + Pipeline p = buildDataflowPipeline(options); + p.run(); + + ArgumentCaptor jobCaptor = ArgumentCaptor.forClass(Job.class); + Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture()); + assertValidJob(jobCaptor.getValue()); + assertNull(jobCaptor.getValue().getStepsLocation()); + } + /** Test for automatically using upload_graph when the job graph is too large (>10MB). */ @Test public void testUploadGraphWithAutoUpload() throws IOException { From ab81e1fc5e9f10a955bb56ca21675004af4ba180 Mon Sep 17 00:00:00 2001 From: Shunping Huang Date: Tue, 13 Aug 2024 14:51:13 -0400 Subject: [PATCH 73/78] Added a data corruption known issue to CHANGES.md and release blogs. (#32166) * Added a data corruption known issue to CHANGES.md and release blogs. 
* Update issue link * Update issue link in release blogs --- CHANGES.md | 7 +++++++ website/www/site/content/en/blog/beam-2.53.0.md | 1 + website/www/site/content/en/blog/beam-2.54.0.md | 1 + website/www/site/content/en/blog/beam-2.55.0.md | 3 +++ website/www/site/content/en/blog/beam-2.56.0.md | 6 ++++++ website/www/site/content/en/blog/beam-2.57.0.md | 4 ++++ website/www/site/content/en/blog/beam-2.58.0.md | 6 ++++++ 7 files changed, 28 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index fce3aa26a72b7..cf2478e02358f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -84,6 +84,7 @@ * Fixed incorrect service account impersonation flow for Python pipelines using BigQuery IOs ([#32030](https://github.com/apache/beam/issues/32030)). * Auto-disable broken and meaningless `upload_graph` feature when using Dataflow Runner V2 ([#32159](https://github.com/apache/beam/issues/32159)). +* (Python) Upgraded google-cloud-storage to version 2.18.2 to fix a data corruption issue ([#32135](https://github.com/apache/beam/pull/32135)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). @@ -120,6 +121,7 @@ ## Known Issues * Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. 
# [2.57.0] - 2024-06-26 @@ -175,6 +177,7 @@ ## Known Issues * Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. # [2.56.0] - 2024-05-01 @@ -212,6 +215,7 @@ * The beam interactive runner does not correctly run on flink ([#31168](https://github.com/apache/beam/issues/31168)). * When using the Flink runner from Python, 1.17 is not supported and 1.12/13 do not work correctly. Support for 1.17 will be added in 2.57.0, and the ability to choose 1.12/13 will be cleaned up and fully removed in 2.57.0 as well ([#31168](https://github.com/apache/beam/issues/31168)). * Large Dataflow graphs using runner v2, or pipelines explicitly enabling the `upload_graph` experiment, will fail at construction time ([#32159](https://github.com/apache/beam/issues/32159)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. # [2.55.1] - 2024-04-08 @@ -266,6 +270,7 @@ * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). 
* WriteToJson broken in languages other than Java (X-lang) ([#30776](https://github.com/apache/beam/issues/30776)). * Python pipelines might occasionally become stuck due to a regression in grpcio ([#30867](https://github.com/apache/beam/issues/30867)). The issue manifests frequently with Bigtable IO connector, but might also affect other GCP connectors. Fixed in 2.56.0. +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. # [2.54.0] - 2024-02-14 @@ -307,6 +312,7 @@ * Some Python pipelines that run with 2.52.0-2.54.0 SDKs and use large materialized side inputs might be affected by a performance regression. To restore the prior behavior on these SDK versions, supply the `--max_cache_memory_usage_mb=0` pipeline option. ([#30360](https://github.com/apache/beam/issues/30360)). * Python pipelines that run with 2.53.0-2.54.0 SDKs and perform file operations on GCS might be affected by excess HTTP requests. This could lead to a performance regression or a permission issue. ([#28398](https://github.com/apache/beam/issues/28398)) * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. 
# [2.53.0] - 2024-01-04 @@ -351,6 +357,7 @@ * Some Python pipelines that run with 2.52.0-2.54.0 SDKs and use large materialized side inputs might be affected by a performance regression. To restore the prior behavior on these SDK versions, supply the `--max_cache_memory_usage_mb=0` pipeline option. ([#30360](https://github.com/apache/beam/issues/30360)). * Python pipelines that run with 2.53.0-2.54.0 SDKs and perform file operations on GCS might be affected by excess HTTP requests. This could lead to a performance regression or a permission issue. ([#28398](https://github.com/apache/beam/issues/28398)) * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. # [2.52.0] - 2023-11-17 diff --git a/website/www/site/content/en/blog/beam-2.53.0.md b/website/www/site/content/en/blog/beam-2.53.0.md index 9a15e86ef3dc8..39f851d1563b0 100644 --- a/website/www/site/content/en/blog/beam-2.53.0.md +++ b/website/www/site/content/en/blog/beam-2.53.0.md @@ -68,6 +68,7 @@ For more information on changes in 2.53.0, check out the [detailed release notes * Some Python pipelines that run with 2.52.0-2.54.0 SDKs and use large materialized side inputs might be affected by a performance regression. To restore the prior behavior on these SDK versions, supply the `--max_cache_memory_usage_mb=0` pipeline option. ([#30360](https://github.com/apache/beam/issues/30360)). 
* Python pipelines that run with 2.53.0-2.54.0 SDKs and perform file operations on GCS might be affected by excess HTTP requests. This could lead to a performance regression or a permission issue. ([#28398](https://github.com/apache/beam/issues/28398)) * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md diff --git a/website/www/site/content/en/blog/beam-2.54.0.md b/website/www/site/content/en/blog/beam-2.54.0.md index a3d649ec6f3f3..ecef90fe84603 100644 --- a/website/www/site/content/en/blog/beam-2.54.0.md +++ b/website/www/site/content/en/blog/beam-2.54.0.md @@ -64,6 +64,7 @@ For more information on changes in 2.54.0, check out the [detailed release notes * Some Python pipelines that run with 2.52.0-2.54.0 SDKs and use large materialized side inputs might be affected by a performance regression. To restore the prior behavior on these SDK versions, supply the `--max_cache_memory_usage_mb=0` pipeline option. ([#30360](https://github.com/apache/beam/issues/30360)). * Python pipelines that run with 2.53.0-2.54.0 SDKs and perform file operations on GCS might be affected by excess HTTP requests. This could lead to a performance regression or a permission issue. 
([#28398](https://github.com/apache/beam/issues/28398)) * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md diff --git a/website/www/site/content/en/blog/beam-2.55.0.md b/website/www/site/content/en/blog/beam-2.55.0.md index 6314dfa928279..2ef05fd781b49 100644 --- a/website/www/site/content/en/blog/beam-2.55.0.md +++ b/website/www/site/content/en/blog/beam-2.55.0.md @@ -74,6 +74,9 @@ For more information on changes in 2.55.0, check out the [detailed release notes ## Known Issues * In Python pipelines, when shutting down inactive bundle processors, shutdown logic can overaggressively hold the lock, blocking acceptance of new work. Symptoms of this issue include slowness or stuckness in long-running jobs. Fixed in 2.56.0 ([#30679](https://github.com/apache/beam/pull/30679)). +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. 
+ +For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md ## List of Contributors diff --git a/website/www/site/content/en/blog/beam-2.56.0.md b/website/www/site/content/en/blog/beam-2.56.0.md index 8107b22920f19..3a441536d7027 100644 --- a/website/www/site/content/en/blog/beam-2.56.0.md +++ b/website/www/site/content/en/blog/beam-2.56.0.md @@ -54,6 +54,12 @@ For more information on changes in 2.56.0, check out the [detailed release notes * Fixed locking issue when shutting down inactive bundle processors. Symptoms of this issue include slowness or stuckness in long-running jobs (Python) ([#30679](https://github.com/apache/beam/pull/30679)). * Fixed logging issue that caused silecing the pip output when installing of dependencies provided in `--requirements_file` (Python). +## Known Issues + +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. + +For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md + ## List of Contributors According to git shortlog, the following people contributed to the 2.56.0 release. Thank you to all contributors! diff --git a/website/www/site/content/en/blog/beam-2.57.0.md b/website/www/site/content/en/blog/beam-2.57.0.md index 62f2896bba0c0..b583b4ee3c516 100644 --- a/website/www/site/content/en/blog/beam-2.57.0.md +++ b/website/www/site/content/en/blog/beam-2.57.0.md @@ -76,6 +76,10 @@ For more information on changes in 2.57.0, check out the [detailed release notes jackson-2.15 has known breaking changes. An important one is it imposed a buffer limit for parser. 
If your custom PTransform/DoFn are affected, refer to [#31580](https://github.com/apache/beam/pull/31580) for mitigation. +## Known Issues + +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. + For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md ## List of Contributors diff --git a/website/www/site/content/en/blog/beam-2.58.0.md b/website/www/site/content/en/blog/beam-2.58.0.md index 603403cd7fdbc..c5d858091fff8 100644 --- a/website/www/site/content/en/blog/beam-2.58.0.md +++ b/website/www/site/content/en/blog/beam-2.58.0.md @@ -49,6 +49,12 @@ For more information about changes in 2.58.0, check out the [detailed release no * [BigQueryIO] Fixed a bug in batch Storage Write API that frequently exhausted concurrent connections quota ([#31710](https://github.com/apache/beam/pull/31710)) +## Known Issues + +* Python pipelines that run with 2.53.0-2.58.0 SDKs and read data from GCS might be affected by a data corruption issue ([#32169](https://github.com/apache/beam/issues/32169)). The issue will be fixed in 2.59.0 ([#32135](https://github.com/apache/beam/pull/32135)). To work around this, update the google-cloud-storage package to version 2.18.2 or newer. + +For the most up to date list of known issues, see https://github.com/apache/beam/blob/master/CHANGES.md + ## List of Contributors According to git shortlog, the following people contributed to the 2.58.0 release. Thank you to all contributors! 
From 8ff7f0d75e45aa31bcc56d2bcd38ef49125295aa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 13 Aug 2024 23:12:52 -0700 Subject: [PATCH 74/78] Bump github.com/docker/docker in /sdks (#32176) Bumps [github.com/docker/docker](https://github.com/docker/docker) from 27.1.1+incompatible to 27.1.2+incompatible. - [Release notes](https://github.com/docker/docker/releases) - [Commits](https://github.com/docker/docker/compare/v27.1.1...v27.1.2) --- updated-dependencies: - dependency-name: github.com/docker/docker dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 3 ++- sdks/go.sum | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 7c3ade37490d1..53aa6f23bfaa2 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -93,6 +93,7 @@ require ( github.com/minio/highwayhash v1.0.3 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect github.com/moby/sys/user v0.1.0 // indirect + github.com/moby/sys/userns v0.1.0 // indirect github.com/nats-io/jwt/v2 v2.5.8 // indirect github.com/nats-io/nkeys v0.4.7 // indirect github.com/nats-io/nuid v1.0.1 // indirect @@ -142,7 +143,7 @@ require ( github.com/cncf/xds/go v0.0.0-20240423153145-555b57ec207b // indirect github.com/containerd/containerd v1.7.18 // indirect github.com/cpuguy83/dockercfg v0.3.1 // indirect - github.com/docker/docker v27.1.1+incompatible // but required to resolve issue docker has with go1.20 + github.com/docker/docker v27.1.2+incompatible // but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect github.com/envoyproxy/go-control-plane v0.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index 5b9bfaef8a48d..f49007e6b7d9a 100644 --- 
a/sdks/go.sum +++ b/sdks/go.sum @@ -779,8 +779,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v27.1.1+incompatible h1:hO/M4MtV36kzKldqnA37IWhebRA+LnqqcqDja6kVaKY= -github.com/docker/docker v27.1.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v27.1.2+incompatible h1:AhGzR1xaQIy53qCkxARaFluI00WPGtXn0AJuoQsVYTY= +github.com/docker/docker v27.1.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= @@ -1065,6 +1065,8 @@ github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5 github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg= github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= From c23e60383bce1628245636d4fc557b70843de342 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 14 Aug 2024 05:34:26 -0400 Subject: [PATCH 75/78] Add 
Lineage metrics to Python BigQueryIO (#32116) * Add Lineage metrics to Python BigQueryIO * Introduce metric.Lineage StringSet wrapper Reflect Java SDK #32090 * Direct Read * Export Read * ReadAllFromBigQuery * FILE_LOAD Write * fix lint; add tests * Consistent metrics name * Update sdks/python/apache_beam/metrics/metric.py Co-authored-by: Danny McCormick --------- Co-authored-by: Danny McCormick --- sdks/python/apache_beam/io/gcp/bigquery.py | 11 +++ .../apache_beam/io/gcp/bigquery_file_loads.py | 12 +++ .../io/gcp/bigquery_file_loads_test.py | 10 +++ .../io/gcp/bigquery_read_internal.py | 7 ++ .../io/gcp/bigquery_schema_tools_test.py | 18 ++-- .../apache_beam/io/gcp/bigquery_test.py | 43 ++++++++++ sdks/python/apache_beam/metrics/metric.py | 83 ++++++++++++++++++- .../python/apache_beam/metrics/metric_test.py | 22 +++++ 8 files changed, 192 insertions(+), 14 deletions(-) diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py index e1c509d0e490c..b897df2d32ab3 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery.py +++ b/sdks/python/apache_beam/io/gcp/bigquery.py @@ -400,6 +400,7 @@ def chain_after(result): from apache_beam.io.iobase import SourceBundle from apache_beam.io.textio import _TextSource as TextSource from apache_beam.metrics import Metrics +from apache_beam.metrics.metric import Lineage from apache_beam.options import value_provider as vp from apache_beam.options.pipeline_options import DebugOptions from apache_beam.options.pipeline_options import GoogleCloudOptions @@ -809,6 +810,11 @@ def split(self, desired_bundle_size, start_position=None, stop_position=None): self.table_reference.get(), project=self._get_project()) elif not self.table_reference.projectId: self.table_reference.projectId = self._get_project() + Lineage.sources().add( + 'bigquery', + self.table_reference.projectId, + self.table_reference.datasetId, + self.table_reference.tableId) schema, metadata_list = self._export_files(bq) 
self.export_result = _BigQueryExportResult( @@ -1157,6 +1163,11 @@ def split(self, desired_bundle_size, start_position=None, stop_position=None): self.table_reference.projectId, self.table_reference.datasetId, self.table_reference.tableId) + Lineage.sources().add( + "bigquery", + self.table_reference.projectId, + self.table_reference.datasetId, + self.table_reference.tableId) if self.use_native_datetime: requested_session.data_format = bq_storage.types.DataFormat.ARROW diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py index 3203c21a8e64a..a7311ad6d0637 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py @@ -40,6 +40,7 @@ from apache_beam.io import filesystems as fs from apache_beam.io.gcp import bigquery_tools from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata +from apache_beam.metrics.metric import Lineage from apache_beam.options import value_provider as vp from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.transforms import trigger @@ -564,6 +565,11 @@ def process_one(self, element, job_name_prefix): write_disposition = self.write_disposition wait_for_job = True self._observed_tables.add(copy_to_reference.tableId) + Lineage.sinks().add( + 'bigquery', + copy_to_reference.projectId, + copy_to_reference.datasetId, + copy_to_reference.tableId) else: wait_for_job = False write_disposition = 'WRITE_APPEND' @@ -735,6 +741,12 @@ def process( yield pvalue.TaggedOutput( TriggerLoadJobs.TEMP_TABLES, bigquery_tools.get_hashable_destination(table_reference)) + else: + Lineage.sinks().add( + 'bigquery', + table_reference.projectId, + table_reference.datasetId, + table_reference.tableId) _LOGGER.info( 'Triggering job %s to load data to BigQuery table %s.' 
diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py index f27c7899f9f38..e4c0e34d9c1f7 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py @@ -42,6 +42,7 @@ from apache_beam.io.gcp.internal.clients import bigquery as bigquery_api from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher +from apache_beam.metrics.metric import Lineage from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import StandardOptions from apache_beam.runners.dataflow.test_dataflow_runner import TestDataflowRunner @@ -510,6 +511,9 @@ def test_load_job_id_used(self): | "GetJobs" >> beam.Map(lambda x: x[1]) assert_that(jobs, equal_to([job_reference]), label='CheckJobProjectIds') + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set(["bigquery:project1.dataset1.table1"])) def test_load_job_id_use_for_copy_job(self): destination = 'project1:dataset1.table1' @@ -563,6 +567,9 @@ def test_load_job_id_use_for_copy_job(self): job_reference ]), label='CheckCopyJobProjectIds') + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set(["bigquery:project1.dataset1.table1"])) @mock.patch('time.sleep') def test_wait_for_load_job_completion(self, sleep_mock): @@ -725,6 +732,9 @@ def test_multiple_partition_files(self): copy_jobs | "CountCopyJobs" >> combiners.Count.Globally(), equal_to([6]), label='CheckCopyJobCount') + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SINK), + set(["bigquery:project1.dataset1.table1"])) @parameterized.expand([ param(write_disposition=BigQueryDisposition.WRITE_TRUNCATE), diff --git a/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py 
b/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py index f3881ed261ae3..f038b48e04d53 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_read_internal.py @@ -44,6 +44,7 @@ from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata from apache_beam.io.iobase import BoundedSource from apache_beam.io.textio import _TextSource +from apache_beam.metrics.metric import Lineage from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.value_provider import ValueProvider @@ -261,6 +262,12 @@ def process(self, for metadata in metadata_list: yield self._create_source(metadata.path, schema) + Lineage.sources().add( + 'bigquery', + table_reference.projectId, + table_reference.datasetId, + table_reference.tableId) + if element.query is not None: self.bq._delete_table( table_reference.projectId, diff --git a/sdks/python/apache_beam/io/gcp/bigquery_schema_tools_test.py b/sdks/python/apache_beam/io/gcp/bigquery_schema_tools_test.py index 72697e29c4d56..7ae49dff205d7 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_schema_tools_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_schema_tools_test.py @@ -136,12 +136,11 @@ def test_bad_schema_public_api_export(self, get_table): with self.assertRaisesRegex(ValueError, "Encountered an unsupported type: 'DOUBLE'"): p = apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table="dataset.sample_table", method="EXPORT", project="project", output_type='BEAM_ROW') - pipeline @mock.patch.object(BigQueryWrapper, 'get_table') def test_bad_schema_public_api_direct_read(self, get_table): @@ -159,21 +158,19 @@ def test_bad_schema_public_api_direct_read(self, get_table): with self.assertRaisesRegex(ValueError, "Encountered an unsupported type: 'DOUBLE'"): p = 
apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table="dataset.sample_table", method="DIRECT_READ", project="project", output_type='BEAM_ROW') - pipeline def test_unsupported_value_provider(self): with self.assertRaisesRegex(TypeError, 'ReadFromBigQuery: table must be of type string' '; got ValueProvider instead'): p = apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table=value_provider.ValueProvider(), output_type='BEAM_ROW') - pipeline def test_unsupported_callable(self): def filterTable(table): @@ -185,9 +182,8 @@ def filterTable(table): 'ReadFromBigQuery: table must be of type string' '; got a callable instead'): p = apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table=res, output_type='BEAM_ROW') - pipeline def test_unsupported_query_export(self): with self.assertRaisesRegex( @@ -195,12 +191,11 @@ def test_unsupported_query_export(self): "Both a query and an output type of 'BEAM_ROW' were specified. " "'BEAM_ROW' is not currently supported with queries."): p = apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table="project:dataset.sample_table", method="EXPORT", query='SELECT name FROM dataset.sample_table', output_type='BEAM_ROW') - pipeline def test_unsupported_query_direct_read(self): with self.assertRaisesRegex( @@ -208,12 +203,11 @@ def test_unsupported_query_direct_read(self): "Both a query and an output type of 'BEAM_ROW' were specified. 
" "'BEAM_ROW' is not currently supported with queries."): p = apache_beam.Pipeline() - pipeline = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( + _ = p | apache_beam.io.gcp.bigquery.ReadFromBigQuery( table="project:dataset.sample_table", method="DIRECT_READ", query='SELECT name FROM dataset.sample_table', output_type='BEAM_ROW') - pipeline if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/io/gcp/bigquery_test.py b/sdks/python/apache_beam/io/gcp/bigquery_test.py index e53204a5ebc6f..c263b636b57ad 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_test.py @@ -50,6 +50,7 @@ from apache_beam.io.gcp.bigquery import TableRowJsonCoder from apache_beam.io.gcp.bigquery import WriteToBigQuery from apache_beam.io.gcp.bigquery import _StreamToBigQuery +from apache_beam.io.gcp.bigquery_read_internal import _BigQueryReadSplit from apache_beam.io.gcp.bigquery_read_internal import _JsonToDictCoder from apache_beam.io.gcp.bigquery_read_internal import bigquery_export_destination_uri from apache_beam.io.gcp.bigquery_tools import JSON_COMPLIANCE_ERROR @@ -61,6 +62,7 @@ from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultMatcher from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryFullResultStreamingMatcher from apache_beam.io.gcp.tests.bigquery_matcher import BigQueryTableMatcher +from apache_beam.metrics.metric import Lineage from apache_beam.options import value_provider from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import StandardOptions @@ -85,9 +87,11 @@ from apitools.base.py.exceptions import HttpError from apitools.base.py.exceptions import HttpForbiddenError from google.cloud import bigquery as gcp_bigquery + from google.cloud import bigquery_storage_v1 as bq_storage from google.api_core import exceptions except ImportError: gcp_bigquery = None + bq_storage = None 
HttpError = None HttpForbiddenError = None exceptions = None @@ -460,6 +464,8 @@ def test_create_temp_dataset_exception(self, exception_type, error_message): self.assertIn(error_message, exc.exception.args[0]) @parameterized.expand([ + # read without exception + param(responses=[], expected_retries=0), # first attempt returns a Http 500 blank error and retries # second attempt returns a Http 408 blank error and retries, # third attempt passes @@ -540,6 +546,9 @@ def store_callback(unused_request): # metadata (numBytes), and once to retrieve the table's schema # Any additional calls are retries self.assertEqual(expected_retries, mock_get_table.call_count - 2) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SOURCE), + set(["bigquery:project.dataset.table"])) @parameterized.expand([ # first attempt returns a Http 429 with transient reason and retries @@ -719,6 +728,40 @@ def test_read_export_exception(self, exception_type, error_message): mock_query_job.assert_called() self.assertIn(error_message, exc.exception.args[0]) + def test_read_direct_lineage(self): + with mock.patch.object(bigquery_tools.BigQueryWrapper, + '_bigquery_client'),\ + mock.patch.object(bq_storage.BigQueryReadClient, + 'create_read_session'),\ + beam.Pipeline() as p: + + _ = p | ReadFromBigQuery( + method=ReadFromBigQuery.Method.DIRECT_READ, + table='project:dataset.table') + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SOURCE), + set(["bigquery:project.dataset.table"])) + + def test_read_all_lineage(self): + with mock.patch.object(_BigQueryReadSplit, '_export_files') as export, \ + beam.Pipeline() as p: + + export.return_value = (None, []) + + _ = ( + p + | beam.Create([ + beam.io.ReadFromBigQueryRequest(table='project1:dataset1.table1'), + beam.io.ReadFromBigQueryRequest(table='project2:dataset2.table2') + ]) + | beam.io.ReadAllFromBigQuery(gcs_location='gs://bucket/tmp')) + self.assertSetEqual( + Lineage.query(p.result.metrics(), Lineage.SOURCE), + set([ + 
'bigquery:project1.dataset1.table1', + 'bigquery:project2.dataset2.table2' + ])) + @unittest.skipIf(HttpError is None, 'GCP dependencies are not installed') class TestBigQuerySink(unittest.TestCase): diff --git a/sdks/python/apache_beam/metrics/metric.py b/sdks/python/apache_beam/metrics/metric.py index 77cafb8bd64b7..6b8e4754a79ce 100644 --- a/sdks/python/apache_beam/metrics/metric.py +++ b/sdks/python/apache_beam/metrics/metric.py @@ -28,6 +28,7 @@ # mypy: disallow-untyped-defs import logging +import re from typing import TYPE_CHECKING from typing import Dict from typing import FrozenSet @@ -39,6 +40,7 @@ from typing import Union from apache_beam.metrics import cells +from apache_beam.metrics.execution import MetricResult from apache_beam.metrics.execution import MetricUpdater from apache_beam.metrics.metricbase import Counter from apache_beam.metrics.metricbase import Distribution @@ -50,7 +52,7 @@ from apache_beam.metrics.execution import MetricKey from apache_beam.metrics.metricbase import Metric -__all__ = ['Metrics', 'MetricsFilter'] +__all__ = ['Metrics', 'MetricsFilter', 'Lineage'] _LOGGER = logging.getLogger(__name__) @@ -223,7 +225,7 @@ def matches( def query( self, filter: Optional['MetricsFilter'] = None - ) -> Dict[str, List['MetricResults']]: + ) -> Dict[str, List['MetricResult']]: """Queries the runner for existing user metrics that match the filter. 
It should return a dictionary, with lists of each kind of metric, and @@ -305,3 +307,80 @@ def with_steps(self, steps: Iterable[str]) -> 'MetricsFilter': self._steps.update(steps) return self + + +class Lineage: + """Standard collection of metrics used to record source and sinks information + for lineage tracking.""" + + LINEAGE_NAMESPACE = "lineage" + SOURCE = "sources" + SINK = "sinks" + + _METRICS = { + SOURCE: Metrics.string_set(LINEAGE_NAMESPACE, SOURCE), + SINK: Metrics.string_set(LINEAGE_NAMESPACE, SINK) + } + + def __init__(self, label: str) -> None: + """Create a Lineage with valid label (:data:`~Lineage.SOURCE` or + :data:`~Lineage.SINK`) + """ + self.metric = Lineage._METRICS[label] + + @classmethod + def sources(cls) -> 'Lineage': + return cls(Lineage.SOURCE) + + @classmethod + def sinks(cls) -> 'Lineage': + return cls(Lineage.SINK) + + _RESERVED_CHARS = re.compile(r'[:\s.]') + + @staticmethod + def wrap_segment(segment: str) -> str: + """Wrap segment to valid segment name. + + Specifically, If there are reserved chars (colon, whitespace, dot), escape + with backtick. If the segment is already wrapped, return the original. + """ + if segment.startswith("`") and segment.endswith("`"): return segment + if Lineage._RESERVED_CHARS.search(segment): + return "`" + segment + "`" + return segment + + @staticmethod + def get_fq_name( + system: str, *segments: str, route: Optional[str] = None) -> str: + """Assemble fully qualified name + (`FQN `_). + Format: + + - `system:segment1.segment2` + - `system:routine:segment1.segment2` + - `system:`segment1.with.dots:clons`.segment2` + + This helper method is for internal and testing usage only. 
+ """ + segs = '.'.join(map(Lineage.wrap_segment, segments)) + if route: + return ':'.join((system, route, segs)) + return ':'.join((system, segs)) + + def add( + self, system: str, *segments: str, route: Optional[str] = None) -> None: + self.metric.add(self.get_fq_name(system, *segments, route=route)) + + @staticmethod + def query(results: MetricResults, label: str) -> Set[str]: + if not label in Lineage._METRICS: + raise ValueError("Label {} does not exist for Lineage", label) + response = results.query( + MetricsFilter().with_namespace(Lineage.LINEAGE_NAMESPACE).with_name( + label))[MetricResults.STRINGSETS] + result = set() + for metric in response: + result.update(metric.committed) + result.update(metric.attempted) + return result diff --git a/sdks/python/apache_beam/metrics/metric_test.py b/sdks/python/apache_beam/metrics/metric_test.py index e3701228feecd..3a8da021101e5 100644 --- a/sdks/python/apache_beam/metrics/metric_test.py +++ b/sdks/python/apache_beam/metrics/metric_test.py @@ -28,6 +28,7 @@ from apache_beam.metrics.execution import MetricKey from apache_beam.metrics.execution import MetricsContainer from apache_beam.metrics.execution import MetricsEnvironment +from apache_beam.metrics.metric import Lineage from apache_beam.metrics.metric import MetricResults from apache_beam.metrics.metric import Metrics from apache_beam.metrics.metric import MetricsFilter @@ -248,5 +249,26 @@ def test_create_counter_distribution(self): sampler.stop() +class LineageTest(unittest.TestCase): + def test_fq_name(self): + test_cases = { + "apache-beam": "apache-beam", + "`apache-beam`": "`apache-beam`", + "apache.beam": "`apache.beam`", + "apache:beam": "`apache:beam`", + "apache beam": "`apache beam`", + "`apache beam`": "`apache beam`", + "apache\tbeam": "`apache\tbeam`", + "apache\nbeam": "`apache\nbeam`" + } + for k, v in test_cases.items(): + self.assertEqual("apache:" + v, Lineage.get_fq_name("apache", k)) + self.assertEqual( + "apache:beam:" + v, 
Lineage.get_fq_name("apache", k, route="beam")) + self.assertEqual( + "apache:beam:" + v + '.' + v, + Lineage.get_fq_name("apache", k, k, route="beam")) + + if __name__ == '__main__': unittest.main() From 8fbad48568833d60a5244d00f4b4b943d82bac0b Mon Sep 17 00:00:00 2001 From: scwhittle Date: Wed, 14 Aug 2024 15:26:12 +0200 Subject: [PATCH 76/78] Change FnApiDoFnRunner to skip trySplit checkpoint requests if not draining and nothing has yet been claimed by the tracker. (#32044) --- .../beam/fn/harness/FnApiDoFnRunner.java | 57 ++- .../beam/fn/harness/FnApiDoFnRunnerTest.java | 465 ++++++++++++++++-- 2 files changed, 485 insertions(+), 37 deletions(-) diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java index f85622ab89fee..c39722c90d89e 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/FnApiDoFnRunner.java @@ -34,6 +34,7 @@ import java.util.Map; import java.util.NavigableSet; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; import java.util.function.BiFunction; import java.util.function.Consumer; @@ -118,6 +119,7 @@ import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.ByteString; import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.util.Durations; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables; @@ -327,6 +329,11 @@ static class Factory currentTracker; + /** + * If 
non-null, set to true after currentTracker has had a tryClaim issued on it. Used to ignore + * checkpoint split requests if no progress was made. + */ + private @Nullable AtomicBoolean currentTrackerClaimed; /** * Only valid during {@link #processTimer} and {@link #processOnWindowExpiration}, null otherwise. @@ -877,12 +884,18 @@ private void processElementForSplitRestriction( currentElement = elem.withValue(elem.getValue().getKey()); currentRestriction = elem.getValue().getValue().getKey(); currentWatermarkEstimatorState = elem.getValue().getValue().getValue(); + currentTrackerClaimed = new AtomicBoolean(false); currentTracker = RestrictionTrackers.observe( doFnInvoker.invokeNewTracker(processContext), new ClaimObserver() { + private final AtomicBoolean claimed = + Preconditions.checkNotNull(currentTrackerClaimed); + @Override - public void onClaimed(PositionT position) {} + public void onClaimed(PositionT position) { + claimed.lazySet(true); + } @Override public void onClaimFailed(PositionT position) {} @@ -894,6 +907,7 @@ public void onClaimFailed(PositionT position) {} currentRestriction = null; currentWatermarkEstimatorState = null; currentTracker = null; + currentTrackerClaimed = null; } this.stateAccessor.finalizeState(); @@ -909,12 +923,18 @@ private void processElementForWindowObservingSplitRestriction( (Iterator) elem.getWindows().iterator(); while (windowIterator.hasNext()) { currentWindow = windowIterator.next(); + currentTrackerClaimed = new AtomicBoolean(false); currentTracker = RestrictionTrackers.observe( doFnInvoker.invokeNewTracker(processContext), new ClaimObserver() { + private final AtomicBoolean claimed = + Preconditions.checkNotNull(currentTrackerClaimed); + @Override - public void onClaimed(PositionT position) {} + public void onClaimed(PositionT position) { + claimed.lazySet(true); + } @Override public void onClaimFailed(PositionT position) {} @@ -927,6 +947,7 @@ public void onClaimFailed(PositionT position) {} 
currentWatermarkEstimatorState = null; currentWindow = null; currentTracker = null; + currentTrackerClaimed = null; } this.stateAccessor.finalizeState(); @@ -937,6 +958,8 @@ private void processElementForTruncateRestriction( currentElement = elem.withValue(elem.getValue().getKey().getKey()); currentRestriction = elem.getValue().getKey().getValue().getKey(); currentWatermarkEstimatorState = elem.getValue().getKey().getValue().getValue(); + // For truncation, we don't set currentTrackerClaimed so that we enable checkpointing even if no + // progress is made. currentTracker = RestrictionTrackers.observe( doFnInvoker.invokeNewTracker(processContext), @@ -989,6 +1012,8 @@ private void processElementForWindowObservingTruncateRestriction( currentRestriction = elem.getValue().getKey().getValue().getKey(); currentWatermarkEstimatorState = elem.getValue().getKey().getValue().getValue(); currentWindow = currentWindows.get(windowCurrentIndex); + // We leave currentTrackerClaimed unset as we want to split regardless of if tryClaim is + // called. 
currentTracker = RestrictionTrackers.observe( doFnInvoker.invokeNewTracker(processContext), @@ -1081,12 +1106,18 @@ private void processElementForWindowObservingSizedElementAndRestriction( currentRestriction = elem.getValue().getKey().getValue().getKey(); currentWatermarkEstimatorState = elem.getValue().getKey().getValue().getValue(); currentWindow = currentWindows.get(windowCurrentIndex); + currentTrackerClaimed = new AtomicBoolean(false); currentTracker = RestrictionTrackers.observe( doFnInvoker.invokeNewTracker(processContext), new ClaimObserver() { + private final AtomicBoolean claimed = + Preconditions.checkNotNull(currentTrackerClaimed); + @Override - public void onClaimed(PositionT position) {} + public void onClaimed(PositionT position) { + claimed.lazySet(true); + } @Override public void onClaimFailed(PositionT position) {} @@ -1107,7 +1138,7 @@ public void onClaimFailed(PositionT position) {} // Attempt to checkpoint the current restriction. HandlesSplits.SplitResult splitResult = - trySplitForElementAndRestriction(0, continuation.resumeDelay()); + trySplitForElementAndRestriction(0, continuation.resumeDelay(), false); /** * After the user has chosen to resume processing later, either the restriction is already @@ -1132,7 +1163,7 @@ private abstract class SplittableFnDataReceiver implements HandlesSplits, FnDataReceiver { @Override public HandlesSplits.SplitResult trySplit(double fractionOfRemainder) { - return trySplitForElementAndRestriction(fractionOfRemainder, Duration.ZERO); + return trySplitForElementAndRestriction(fractionOfRemainder, Duration.ZERO, true); } @Override @@ -1278,6 +1309,13 @@ private HandlesSplits.SplitResult trySplitForWindowObservingTruncateRestriction( if (currentWindow == null) { return null; } + // We are requesting a checkpoint but have not yet progressed on the restriction, skip + // request. 
+ if (fractionOfRemainder == 0 + && currentTrackerClaimed != null + && !currentTrackerClaimed.get()) { + return null; + } SplitResultsWithStopIndex splitResult = computeSplitForProcessOrTruncate( @@ -1620,7 +1658,7 @@ static HandlesSplits.SplitResult constructSplitResult } private HandlesSplits.SplitResult trySplitForElementAndRestriction( - double fractionOfRemainder, Duration resumeDelay) { + double fractionOfRemainder, Duration resumeDelay, boolean requireClaimForCheckpoint) { KV watermarkAndState; WindowedSplitResult windowedSplitResult = null; synchronized (splitLock) { @@ -1628,6 +1666,13 @@ private HandlesSplits.SplitResult trySplitForElementAndRestriction( if (currentTracker == null) { return null; } + // The tracker has not yet been claimed meaning that a checkpoint won't meaningfully advance. + if (fractionOfRemainder == 0 + && requireClaimForCheckpoint + && currentTrackerClaimed != null + && !currentTrackerClaimed.get()) { + return null; + } // Make sure to get the output watermark before we split to ensure that the lower bound // applies to the residual. 
watermarkAndState = currentWatermarkEstimator.getWatermarkAndState(); diff --git a/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/FnApiDoFnRunnerTest.java b/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/FnApiDoFnRunnerTest.java index 11f25ab0116ef..f4d555dabcc18 100644 --- a/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/FnApiDoFnRunnerTest.java +++ b/sdks/java/harness/src/test/java/org/apache/beam/fn/harness/FnApiDoFnRunnerTest.java @@ -24,6 +24,7 @@ import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.anEmptyMap; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.containsString; @@ -53,6 +54,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Supplier; import org.apache.beam.fn.harness.FnApiDoFnRunner.SplitResultsWithStopIndex; import org.apache.beam.fn.harness.FnApiDoFnRunner.WindowedSplitResult; @@ -151,6 +153,7 @@ import org.joda.time.Duration; import org.joda.time.Instant; import org.joda.time.format.PeriodFormat; +import org.junit.Assert; import org.junit.Before; import org.junit.Ignore; import org.junit.Rule; @@ -1370,41 +1373,83 @@ public void testRegistration() { *

  • splitting thread: {@link * NonWindowObservingTestSplittableDoFn#waitForSplitElementToBeProcessed()} *
  • process element thread: {@link - * NonWindowObservingTestSplittableDoFn#enableAndWaitForTrySplitToHappen()} + * NonWindowObservingTestSplittableDoFn#splitElementProcessed()} *
  • splitting thread: perform try split - *
  • splitting thread: {@link - * NonWindowObservingTestSplittableDoFn#releaseWaitingProcessElementThread()} + *
  • splitting thread: {@link NonWindowObservingTestSplittableDoFn#trySplitPerformed()} * + *
  • process element thread: {@link + * NonWindowObservingTestSplittableDoFn#waitForTrySplitPerformed()} * */ static class NonWindowObservingTestSplittableDoFn extends DoFn { - private static final ConcurrentMap> - DOFN_INSTANCE_TO_LOCK = new ConcurrentHashMap<>(); + private static final ConcurrentMap DOFN_INSTANCE_TO_LATCHES = + new ConcurrentHashMap<>(); private static final long SPLIT_ELEMENT = 3; private static final long CHECKPOINT_UPPER_BOUND = 8; - private KV getLatches() { - return DOFN_INSTANCE_TO_LOCK.computeIfAbsent( - this.uuid, (uuid) -> KV.of(new CountDownLatch(1), new CountDownLatch(1))); + static class Latches { + public Latches() {} + + CountDownLatch blockProcessLatch = new CountDownLatch(0); + CountDownLatch processEnteredLatch = new CountDownLatch(1); + CountDownLatch splitElementProcessedLatch = new CountDownLatch(1); + CountDownLatch trySplitPerformedLatch = new CountDownLatch(1); + AtomicBoolean abortProcessing = new AtomicBoolean(); + } + + private Latches getLatches() { + return DOFN_INSTANCE_TO_LATCHES.computeIfAbsent(this.uuid, (uuid) -> new Latches()); + } + + public void splitElementProcessed() { + getLatches().splitElementProcessedLatch.countDown(); } - public void enableAndWaitForTrySplitToHappen() throws Exception { - KV latches = getLatches(); - latches.getKey().countDown(); - if (!latches.getValue().await(30, TimeUnit.SECONDS)) { + public void waitForSplitElementToBeProcessed() throws InterruptedException { + if (!getLatches().splitElementProcessedLatch.await(30, TimeUnit.SECONDS)) { fail("Failed to wait for trySplit to occur."); } } - public void waitForSplitElementToBeProcessed() throws Exception { - KV latches = getLatches(); - if (!latches.getKey().await(30, TimeUnit.SECONDS)) { - fail("Failed to wait for split element to be processed."); + public void trySplitPerformed() { + getLatches().trySplitPerformedLatch.countDown(); + } + + public void waitForTrySplitPerformed() throws InterruptedException { + if 
(!getLatches().trySplitPerformedLatch.await(30, TimeUnit.SECONDS)) { + fail("Failed to wait for trySplit to occur."); } } - public void releaseWaitingProcessElementThread() { - KV latches = getLatches(); - latches.getValue().countDown(); + // Must be called before process is invoked. Will prevent process from doing anything until + // unblockProcess is + // called. + public void setupBlockProcess() { + getLatches().blockProcessLatch = new CountDownLatch(1); + } + + public void enterProcessAndBlockIfEnabled() throws InterruptedException { + getLatches().processEnteredLatch.countDown(); + if (!getLatches().blockProcessLatch.await(30, TimeUnit.SECONDS)) { + fail("Failed to wait for unblockProcess to occur."); + } + } + + public void waitForProcessEntered() throws InterruptedException { + if (!getLatches().processEnteredLatch.await(5, TimeUnit.SECONDS)) { + fail("Failed to wait for process to begin."); + } + } + + public void unblockProcess() throws InterruptedException { + getLatches().blockProcessLatch.countDown(); + } + + public void setAbortProcessing() { + getLatches().abortProcessing.set(true); + } + + public boolean shouldAbortProcessing() { + return getLatches().abortProcessing.get(); } private final String uuid; @@ -1421,13 +1466,14 @@ public ProcessContinuation processElement( throws Exception { long checkpointUpperBound = CHECKPOINT_UPPER_BOUND; long position = tracker.currentRestriction().getFrom(); - boolean claimStatus; - while (true) { + boolean claimStatus = true; + while (!shouldAbortProcessing()) { claimStatus = tracker.tryClaim(position); if (!claimStatus) { break; } else if (position == SPLIT_ELEMENT) { - enableAndWaitForTrySplitToHappen(); + splitElementProcessed(); + waitForTrySplitPerformed(); } context.outputWithTimestamp( context.element() + ":" + position, @@ -1511,15 +1557,17 @@ public ProcessContinuation processElement( RestrictionTracker tracker, ManualWatermarkEstimator watermarkEstimator) throws Exception { + 
enterProcessAndBlockIfEnabled(); long checkpointUpperBound = Long.parseLong(context.sideInput(singletonSideInput)); long position = tracker.currentRestriction().getFrom(); - boolean claimStatus; - while (true) { + boolean claimStatus = true; + while (!shouldAbortProcessing()) { claimStatus = tracker.tryClaim(position); if (!claimStatus) { break; } else if (position == NonWindowObservingTestSplittableDoFn.SPLIT_ELEMENT) { - enableAndWaitForTrySplitToHappen(); + splitElementProcessed(); + waitForTrySplitPerformed(); } context.outputWithTimestamp( context.element() + ":" + position, @@ -1549,7 +1597,8 @@ public TruncateResult truncateRestriction(@Restriction OffsetRange throws Exception { // Waiting for split when we are on the second window. if (splitAtTruncate && processedWindowCount == PROCESSED_WINDOW) { - enableAndWaitForTrySplitToHappen(); + splitElementProcessed(); + waitForTrySplitPerformed(); } processedWindowCount += 1; return TruncateResult.of(new OffsetRange(range.getFrom(), range.getTo() / 2)); @@ -1755,7 +1804,217 @@ public void testProcessElementForSizedElementAndRestriction() throws Exception { return ((HandlesSplits) mainInput).trySplit(0); } finally { - doFn.releaseWaitingProcessElementThread(); + doFn.trySplitPerformed(); + } + }); + + // Check that before processing an element we don't report progress + assertNoReportedProgress(context.getBundleProgressReporters()); + mainInput.accept( + valueInGlobalWindow( + KV.of( + KV.of("7", KV.of(new OffsetRange(0, 5), GlobalWindow.TIMESTAMP_MIN_VALUE)), + 2.0))); + HandlesSplits.SplitResult trySplitResult = trySplitFuture.get(); + + // Check that after processing an element we don't report progress + assertNoReportedProgress(context.getBundleProgressReporters()); + + // Since the SPLIT_ELEMENT is 3 we will process 0, 1, 2, 3 then be split. 
+ // We expect that the watermark advances to MIN + 2 since the manual watermark estimator + // has yet to be invoked for the split element and that the primary represents [0, 4) with + // the original watermark while the residual represents [4, 5) with the new MIN + 2 + // watermark. + assertThat( + mainOutputValues, + contains( + timestampedValueInGlobalWindow( + "7:0", GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(0))), + timestampedValueInGlobalWindow( + "7:1", GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(1))), + timestampedValueInGlobalWindow( + "7:2", GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(2))), + timestampedValueInGlobalWindow( + "7:3", GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(3))))); + + BundleApplication primaryRoot = Iterables.getOnlyElement(trySplitResult.getPrimaryRoots()); + DelayedBundleApplication residualRoot = + Iterables.getOnlyElement(trySplitResult.getResidualRoots()); + assertEquals(ParDoTranslation.getMainInputName(pTransform), primaryRoot.getInputId()); + assertEquals(TEST_TRANSFORM_ID, primaryRoot.getTransformId()); + assertEquals( + ParDoTranslation.getMainInputName(pTransform), + residualRoot.getApplication().getInputId()); + assertEquals(TEST_TRANSFORM_ID, residualRoot.getApplication().getTransformId()); + assertEquals( + valueInGlobalWindow( + KV.of( + KV.of("7", KV.of(new OffsetRange(0, 4), GlobalWindow.TIMESTAMP_MIN_VALUE)), + 4.0)), + inputCoder.decode(primaryRoot.getElement().newInput())); + assertEquals( + valueInGlobalWindow( + KV.of( + KV.of( + "7", + KV.of( + new OffsetRange(4, 5), + GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(2)))), + 1.0)), + inputCoder.decode(residualRoot.getApplication().getElement().newInput())); + Instant expectedOutputWatermark = GlobalWindow.TIMESTAMP_MIN_VALUE.plus(Duration.millis(2)); + assertEquals( + ImmutableMap.of( + "output", + org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.Timestamp.newBuilder() + 
.setSeconds(expectedOutputWatermark.getMillis() / 1000) + .setNanos((int) (expectedOutputWatermark.getMillis() % 1000) * 1000000) + .build()), + residualRoot.getApplication().getOutputWatermarksMap()); + // We expect 0 resume delay. + assertEquals( + residualRoot.getRequestedTimeDelay().getDefaultInstanceForType(), + residualRoot.getRequestedTimeDelay()); + // We don't expect the outputs to goto the SDK initiated checkpointing listener. + assertTrue(splitListener.getPrimaryRoots().isEmpty()); + assertTrue(splitListener.getResidualRoots().isEmpty()); + mainOutputValues.clear(); + executorService.shutdown(); + } + + Iterables.getOnlyElement(context.getFinishBundleFunctions()).run(); + assertThat(mainOutputValues, empty()); + + Iterables.getOnlyElement(context.getTearDownFunctions()).run(); + assertThat(mainOutputValues, empty()); + + // Assert that state data did not change + assertEquals( + new FakeBeamFnStateClient(StringUtf8Coder.of(), stateData).getData(), + fakeClient.getData()); + } + + @Test + public void testProcessElementForSizedElementAndRestrictionSplitBeforeTryClaim() + throws Exception { + Pipeline p = Pipeline.create(); + addExperiment(p.getOptions().as(ExperimentalOptions.class), "beam_fn_api"); + // TODO(BEAM-10097): Remove experiment once all portable runners support this view type + addExperiment(p.getOptions().as(ExperimentalOptions.class), "use_runner_v2"); + PCollection valuePCollection = p.apply(Create.of("unused")); + PCollectionView singletonSideInputView = valuePCollection.apply(View.asSingleton()); + WindowObservingTestSplittableDoFn doFn = + new WindowObservingTestSplittableDoFn(singletonSideInputView); + valuePCollection.apply( + TEST_TRANSFORM_ID, ParDo.of(doFn).withSideInputs(singletonSideInputView)); + + RunnerApi.Pipeline pProto = + ProtoOverrides.updateTransform( + PTransformTranslation.PAR_DO_TRANSFORM_URN, + PipelineTranslation.toProto(p, SdkComponents.create(p.getOptions()), true), + 
SplittableParDoExpander.createSizedReplacement()); + String expandedTransformId = + Iterables.find( + pProto.getComponents().getTransformsMap().entrySet(), + entry -> + entry + .getValue() + .getSpec() + .getUrn() + .equals( + PTransformTranslation + .SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN) + && entry.getValue().getUniqueName().contains(TEST_TRANSFORM_ID)) + .getKey(); + RunnerApi.PTransform pTransform = + pProto.getComponents().getTransformsOrThrow(expandedTransformId); + String inputPCollectionId = + pTransform.getInputsOrThrow(ParDoTranslation.getMainInputName(pTransform)); + RunnerApi.PCollection inputPCollection = + pProto.getComponents().getPcollectionsOrThrow(inputPCollectionId); + RehydratedComponents rehydratedComponents = + RehydratedComponents.forComponents(pProto.getComponents()); + Coder inputCoder = + WindowedValue.getFullCoder( + CoderTranslation.fromProto( + pProto.getComponents().getCodersOrThrow(inputPCollection.getCoderId()), + rehydratedComponents, + TranslationContext.DEFAULT), + (Coder) + CoderTranslation.fromProto( + pProto + .getComponents() + .getCodersOrThrow( + pProto + .getComponents() + .getWindowingStrategiesOrThrow( + inputPCollection.getWindowingStrategyId()) + .getWindowCoderId()), + rehydratedComponents, + TranslationContext.DEFAULT)); + String outputPCollectionId = pTransform.getOutputsOrThrow("output"); + + ImmutableMap> stateData = + ImmutableMap.of( + iterableSideInputKey( + singletonSideInputView.getTagInternal().getId(), ByteString.EMPTY), + asList("8")); + + FakeBeamFnStateClient fakeClient = new FakeBeamFnStateClient(StringUtf8Coder.of(), stateData); + + BundleSplitListener.InMemory splitListener = BundleSplitListener.InMemory.create(); + + PTransformRunnerFactoryTestContext context = + PTransformRunnerFactoryTestContext.builder(TEST_TRANSFORM_ID, pTransform) + .beamFnStateClient(fakeClient) + .processBundleInstructionId("57") + .pCollections(pProto.getComponentsOrBuilder().getPcollectionsMap()) + 
.coders(pProto.getComponents().getCodersMap()) + .windowingStrategies(pProto.getComponents().getWindowingStrategiesMap()) + .splitListener(splitListener) + .build(); + List> mainOutputValues = new ArrayList<>(); + context.addPCollectionConsumer( + outputPCollectionId, + (FnDataReceiver) (FnDataReceiver>) mainOutputValues::add); + + new FnApiDoFnRunner.Factory<>().createRunnerForPTransform(context); + + Iterables.getOnlyElement(context.getStartBundleFunctions()).run(); + mainOutputValues.clear(); + + assertThat( + context.getPCollectionConsumers().keySet(), + containsInAnyOrder(inputPCollectionId, outputPCollectionId)); + + FnDataReceiver> mainInput = + context.getPCollectionConsumer(inputPCollectionId); + assertThat(mainInput, instanceOf(HandlesSplits.class)); + + doFn.setupBlockProcess(); + { + // Setup and launch the trySplit thread. + ExecutorService executorService = Executors.newSingleThreadExecutor(); + Future trySplitFuture = + executorService.submit( + () -> { + try { + // Verify that a split before anything is claimed is ignored. + doFn.waitForProcessEntered(); + Assert.assertNull(((HandlesSplits) mainInput).trySplit(0)); + doFn.unblockProcess(); + + doFn.waitForSplitElementToBeProcessed(); + // Currently processing "3" out of range [0, 5) elements. 
+ assertEquals(0.6, ((HandlesSplits) mainInput).getProgress(), 0.01); + + // Check that during progressing of an element we report progress + assertReportedProgressEquals( + context.getShortIdMap(), context.getBundleProgressReporters(), 3.0, 2.0); + + return ((HandlesSplits) mainInput).trySplit(0); + } finally { + doFn.trySplitPerformed(); } }); @@ -1845,6 +2104,149 @@ public void testProcessElementForSizedElementAndRestriction() throws Exception { fakeClient.getData()); } + @Test + public void testProcessElementForSizedElementAndRestrictionNoTryClaim() throws Exception { + Pipeline p = Pipeline.create(); + addExperiment(p.getOptions().as(ExperimentalOptions.class), "beam_fn_api"); + // TODO(BEAM-10097): Remove experiment once all portable runners support this view type + addExperiment(p.getOptions().as(ExperimentalOptions.class), "use_runner_v2"); + PCollection valuePCollection = p.apply(Create.of("unused")); + PCollectionView singletonSideInputView = valuePCollection.apply(View.asSingleton()); + WindowObservingTestSplittableDoFn doFn = + new WindowObservingTestSplittableDoFn(singletonSideInputView); + doFn.setAbortProcessing(); + valuePCollection.apply( + TEST_TRANSFORM_ID, ParDo.of(doFn).withSideInputs(singletonSideInputView)); + + RunnerApi.Pipeline pProto = + ProtoOverrides.updateTransform( + PTransformTranslation.PAR_DO_TRANSFORM_URN, + PipelineTranslation.toProto(p, SdkComponents.create(p.getOptions()), true), + SplittableParDoExpander.createSizedReplacement()); + String expandedTransformId = + Iterables.find( + pProto.getComponents().getTransformsMap().entrySet(), + entry -> + entry + .getValue() + .getSpec() + .getUrn() + .equals( + PTransformTranslation + .SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN) + && entry.getValue().getUniqueName().contains(TEST_TRANSFORM_ID)) + .getKey(); + RunnerApi.PTransform pTransform = + pProto.getComponents().getTransformsOrThrow(expandedTransformId); + String inputPCollectionId = + 
pTransform.getInputsOrThrow(ParDoTranslation.getMainInputName(pTransform)); + RunnerApi.PCollection inputPCollection = + pProto.getComponents().getPcollectionsOrThrow(inputPCollectionId); + RehydratedComponents rehydratedComponents = + RehydratedComponents.forComponents(pProto.getComponents()); + Coder inputCoder = + WindowedValue.getFullCoder( + CoderTranslation.fromProto( + pProto.getComponents().getCodersOrThrow(inputPCollection.getCoderId()), + rehydratedComponents, + TranslationContext.DEFAULT), + (Coder) + CoderTranslation.fromProto( + pProto + .getComponents() + .getCodersOrThrow( + pProto + .getComponents() + .getWindowingStrategiesOrThrow( + inputPCollection.getWindowingStrategyId()) + .getWindowCoderId()), + rehydratedComponents, + TranslationContext.DEFAULT)); + String outputPCollectionId = pTransform.getOutputsOrThrow("output"); + + ImmutableMap> stateData = + ImmutableMap.of( + iterableSideInputKey( + singletonSideInputView.getTagInternal().getId(), ByteString.EMPTY), + asList("8")); + + FakeBeamFnStateClient fakeClient = new FakeBeamFnStateClient(StringUtf8Coder.of(), stateData); + + BundleSplitListener.InMemory splitListener = BundleSplitListener.InMemory.create(); + + PTransformRunnerFactoryTestContext context = + PTransformRunnerFactoryTestContext.builder(TEST_TRANSFORM_ID, pTransform) + .beamFnStateClient(fakeClient) + .processBundleInstructionId("57") + .pCollections(pProto.getComponentsOrBuilder().getPcollectionsMap()) + .coders(pProto.getComponents().getCodersMap()) + .windowingStrategies(pProto.getComponents().getWindowingStrategiesMap()) + .splitListener(splitListener) + .build(); + List> mainOutputValues = new ArrayList<>(); + context.addPCollectionConsumer( + outputPCollectionId, + (FnDataReceiver) (FnDataReceiver>) mainOutputValues::add); + + new FnApiDoFnRunner.Factory<>().createRunnerForPTransform(context); + + Iterables.getOnlyElement(context.getStartBundleFunctions()).run(); + mainOutputValues.clear(); + + assertThat( + 
context.getPCollectionConsumers().keySet(), + containsInAnyOrder(inputPCollectionId, outputPCollectionId)); + + FnDataReceiver> mainInput = + context.getPCollectionConsumer(inputPCollectionId); + assertThat(mainInput, instanceOf(HandlesSplits.class)); + + { + // Check that before processing an element we don't report progress + assertNoReportedProgress(context.getBundleProgressReporters()); + mainInput.accept( + valueInGlobalWindow( + KV.of( + KV.of("5", KV.of(new OffsetRange(5, 10), GlobalWindow.TIMESTAMP_MIN_VALUE)), + 5.0))); + // Check that after processing an element we don't report progress + assertNoReportedProgress(context.getBundleProgressReporters()); + + // Since we set abort processing above, we expect the input restriction to be output with a + // resume + // delay. + BundleApplication primaryRoot = Iterables.getOnlyElement(splitListener.getPrimaryRoots()); + DelayedBundleApplication residualRoot = + Iterables.getOnlyElement(splitListener.getResidualRoots()); + assertEquals(ParDoTranslation.getMainInputName(pTransform), primaryRoot.getInputId()); + assertEquals(TEST_TRANSFORM_ID, primaryRoot.getTransformId()); + assertEquals( + ParDoTranslation.getMainInputName(pTransform), + residualRoot.getApplication().getInputId()); + assertEquals(TEST_TRANSFORM_ID, residualRoot.getApplication().getTransformId()); + assertEquals( + valueInGlobalWindow( + KV.of( + KV.of("5", KV.of(new OffsetRange(5, 5), GlobalWindow.TIMESTAMP_MIN_VALUE)), + 0.0)), + inputCoder.decode(primaryRoot.getElement().newInput())); + assertEquals( + valueInGlobalWindow( + KV.of( + KV.of("5", KV.of(new OffsetRange(5, 10), GlobalWindow.TIMESTAMP_MIN_VALUE)), + 5.0)), + inputCoder.decode(residualRoot.getApplication().getElement().newInput())); + assertThat(residualRoot.getApplication().getOutputWatermarksMap(), anEmptyMap()); + assertEquals( + org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.Duration.newBuilder() + .setSeconds(54) + .setNanos(321000000) + .build(), + 
residualRoot.getRequestedTimeDelay()); + splitListener.clear(); + } + } + private static final MonitoringInfo WORK_COMPLETED_MI = MonitoringInfo.newBuilder() .setUrn(MonitoringInfoConstants.Urns.WORK_COMPLETED) @@ -2187,7 +2589,7 @@ public void testProcessElementForWindowedSizedElementAndRestriction() throws Exc return ((HandlesSplits) mainInput).trySplit(0); } finally { - doFn.releaseWaitingProcessElementThread(); + doFn.trySplitPerformed(); } }); @@ -3143,10 +3545,11 @@ public void testProcessElementForTruncateAndSizeRestrictionForwardSplitWhenObser () -> { try { doFn.waitForSplitElementToBeProcessed(); - - return ((HandlesSplits) mainInput).trySplit(0); + HandlesSplits.SplitResult result = ((HandlesSplits) mainInput).trySplit(0); + Assert.assertNotNull(result); + return result; } finally { - doFn.releaseWaitingProcessElementThread(); + doFn.trySplitPerformed(); } }); From dbd719ba1448c13bc97237d1b701e2978c0e29d4 Mon Sep 17 00:00:00 2001 From: Jack McCluskey <34928439+jrmccluskey@users.noreply.github.com> Date: Wed, 14 Aug 2024 09:50:17 -0400 Subject: [PATCH 77/78] [WIP] Gemma Sentiment and Summarization Example Notebook (#32172) --- .../gemma_2_sentiment_and_summarization.ipynb | 625 ++++++++++++++++++ 1 file changed, 625 insertions(+) create mode 100644 examples/notebooks/beam-ml/gemma_2_sentiment_and_summarization.ipynb diff --git a/examples/notebooks/beam-ml/gemma_2_sentiment_and_summarization.ipynb b/examples/notebooks/beam-ml/gemma_2_sentiment_and_summarization.ipynb new file mode 100644 index 0000000000000..b45d9d7aea959 --- /dev/null +++ b/examples/notebooks/beam-ml/gemma_2_sentiment_and_summarization.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "BrKf6TQ98qIJ", + "metadata": { + "id": "BrKf6TQ98qIJ" + }, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) 
under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "markdown", + "id": "hHg4SoUr8qIK", + "metadata": { + "id": "hHg4SoUr8qIK" + }, + "source": [ + "# Use Gemma to gauge sentiment and summarize conversations\n", + "\n", + "\n", + " \n", + " \n", + "
    \n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "yOs5SCyPdYNi", + "metadata": { + "id": "yOs5SCyPdYNi" + }, + "source": [ + "Gemma is a family of lightweight, state-of-the art open models built from research and technology used to create the Gemini models. You can use Gemma models in your Apache Beam inference pipelines.\n", + "\n", + "Because large language models (LLMs) like Gemma are versatile, you can integrate them into business processes. The example in this notebook demonstrates how to use Gemma to gauge the sentiment of a conversation, summarize that conversation's content, and draft a reply for a difficult conversation. The system allows a person to review the reply before it's sent to customers. For more information, see the blog post [Gemma for Streaming ML with Dataflow](https://developers.googleblog.com/en/gemma-for-streaming-ml-with-dataflow).\n", + "\n", + "A requirement of this work is that customers who express a negative sentiment receive a reply in near real-time. As a result, the workflow needs to use a streaming data pipeline with an LLM that has minimal latency.\n", + "\n", + "## Use case\n", + "\n", + "An example use case is a bustling food chain grappling with analyzing and storing a high volume of customer support requests. 
Customer interactions include both chats generated by automated chatbots and nuanced conversations that require the attention of live support staff.\n", + "\n", + "### Requirements\n", + "\n", + "To address both types of interactions, the workflow has the following requirements.\n", + "\n", + "- It needs to efficiently manage and store chat data by summarizing positive interactions for easy reference and future analysis.\n", + "\n", + "- It must use real-time issue detection and resolution.\n", + "\n", + "- Sentiment analysis must identify dissatisfied customers and generate tailored responses to address their concerns.\n", + "\n", + "### Workflow\n", + "\n", + "To meet these requirements, the pipeline processes completed chat messages in near real time. First, the pipeline uses Gemma to monitor the sentiment of the customer chats. All chats are then summarized, with positive or neutral sentiment chats sent directly to a data platform, BigQuery, by using the available Dataflow I/Os.\n", + "\n", + "For chats that have a negative sentiment, the Gemma model crafts a contextually appropriate response for the customer. This response is sent to a human for review so that they can refine the message before it reaches the customer.\n", + "\n", + "This example addresses important complexities inherent in using an LLM within a pipeline. For example, processing the responses in code is challenging because of the non-deterministic nature of the text. In this example, the workflow requires the LLM to generate JSON responses, which is not the default format. 
The workflow needs to parse and validate the response, a process similar to processing data from sources that don't always have correctly structured data.\n", + "\n", + "This workflow allows businesses to respond to customers faster and to provide personalized responses when needed.\n", + "\n", + "- The automation of positive chat summarization allows support staff to focus on more complex interactions.\n", + "- The scalability of the system makes it possible to adapt to increasing chat volumes without compromising response quality.\n", + "\n", + "You can also use the in-depth analysis of chat data to drive data-driven decision-making." + ] + }, + { + "cell_type": "markdown", + "id": "tGZfleinj3xM", + "metadata": { + "id": "tGZfleinj3xM" + }, + "source": [ + "## The data processing pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "G-VpT7-FjcSu", + "metadata": { + "id": "G-VpT7-FjcSu" + }, + "source": [ + "![Screenshot 2024-08-08 at 11.15.41.png]()" + ] + }, + { + "cell_type": "markdown", + "id": "mXtb21lMj_rU", + "metadata": { + "id": "mXtb21lMj_rU" + }, + "source": [ + "At a high level, the pipeline has the following steps:\n", + "\n", + "1. Read the review data from Pub/Sub, the event messaging source. This data contains the chat ID and the chat history as a JSON payload. This payload is processed in the pipeline.\n", + "1. Pass the text from the messages to Gemma with a prompt.\n", + "1. The pipeline requests that the model complete the following two tasks:\n", + " * Attach a sentiment score to the message, by using one of the following three values: `1` for a positive chat, `0` for a neutral chat, and `-1` for a negative chat.\n", + " * Provide a one-sentence summary of the chat.\n", + "1. The pipeline branches, depending on the sentiment score:\n", + " * If the score is `1` or `0`, the chat and its summarization are sent to a data analytics system for storage and future analysis.\n", + " * If the score is `-1`, the Gemma model drafts a response.
This response and the chat information are sent to an event messaging system that connects the pipeline and other applications. This step allows a person to review the content of the response. " + ] + }, + { + "cell_type": "markdown", + "id": "nETbaxwZk7us", + "metadata": { + "id": "nETbaxwZk7us" + }, + "source": [ + "## Build the pipeline\n", + "\n", + "This section provides the code needed to run the pipeline.\n", + "\n", + "### Before you begin\n", + "\n", + "Although you can use CPUs for testing and development, for a production Dataflow ML system we recommend that you use GPUs. When you use GPUs with Dataflow, we recommend that you use a custom container. For more information about configuring GPUs and custom containers with Dataflow, see [Best practices for working with Dataflow GPUs](https://cloud.google.com/dataflow/docs/gpu/develop-with-gpus). To facilitate rapid testing of the pipeline, follow the guide [Run a pipeline with GPUs](https://cloud.google.com/dataflow/docs/gpu/use-gpus). The guide includes the steps needed to build the container image.\n", + "\n", + "After you configure your environment, download the model [gemma2_instruct_2b_en](https://www.kaggle.com/models/google/gemma-2/keras) into a folder. In this example, the folder is named `gemma2`." + ] + }, + { + "cell_type": "markdown", + "id": "jMrjYGW9spFG", + "metadata": { + "id": "jMrjYGW9spFG" + }, + "source": [ + "### Build the base image\n", + "\n", + "Add the following Dockerfile to your folder, and then build the base image. Use the Dockerfile to build the image as you create the `pipeline.py` file. The images are broken into two groups to facilitate testing and development."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29dOdG_kxzTv", + "metadata": { + "id": "29dOdG_kxzTv" + }, + "outputs": [], + "source": [ + "ARG SERVING_BUILD_IMAGE=tensorflow/tensorflow:2.16.1-gpu\n", + "\n", + "FROM ${SERVING_BUILD_IMAGE}\n", + "WORKDIR /workspace\n", + "\n", + "COPY gemma2 gemma2\n", + "RUN apt-get update -y && apt-get install -y cmake && apt-get install -y vim" + ] + }, + { + "cell_type": "markdown", + "id": "3eWt8AatpEuG", + "metadata": { + "id": "3eWt8AatpEuG" + }, + "source": [ + "When testing the pipeline code and when launching the job on Dataflow, test and launch from inside the container. This step prevents dependency mismatches when running the pipeline on Dataflow." + ] + }, + { + "cell_type": "markdown", + "id": "lyS0uYpsoeOW", + "metadata": { + "id": "lyS0uYpsoeOW" + }, + "source": [ + "The `requirements.txt` file contains the following dependencies: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "K4gJQ0e9pCR4", + "metadata": { + "id": "K4gJQ0e9pCR4" + }, + "outputs": [], + "source": [ + "apache_beam[gcp]==2.54.0\n", + "keras_nlp==0.14.3\n", + "keras==3.4.1\n", + "jax[cuda12]" + ] + }, + { + "cell_type": "markdown", + "id": "k9gGtkqvn6Ar", + "metadata": { + "id": "k9gGtkqvn6Ar" + }, + "source": [ + "The next step includes the files needed to construct the pipeline. The contents of the `pipeline.py` file are contained in a later section of this notebook." + ] + }, + { + "cell_type": "markdown", + "id": "aqPS_p3Pp37b", + "metadata": { + "id": "aqPS_p3Pp37b" + }, + "source": [ + "Replace DOCKERFILE_IMAGE with the image that you built using the first Dockerfile."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "EdUxmUjqx58r", + "metadata": { + "id": "EdUxmUjqx58r" + }, + "outputs": [], + "source": [ + "FROM \n", + "\n", + "WORKDIR /workspace\n", + "\n", + "# Copy files from the official SDK image, including the script and dependencies.\n", + "COPY --from=apache/beam_python3.11_sdk:2.54.0 /opt/apache/beam /opt/apache/beam\n", + "\n", + "\n", + "COPY requirements.txt requirements.txt\n", + "RUN pip install --upgrade --no-cache-dir pip \\\n", + " && pip install --no-cache-dir -r requirements.txt\n", + "\n", + "# Copy the model directory downloaded from Kaggle and the pipeline code.\n", + "COPY pipeline.py pipeline.py\n", + "\n", + "# The colab was tested and run with a JAX backend to let Dataflow workers\n", + "# pick up the environment needed to include in the Env of the image.\n", + "ENV KERAS_BACKEND=\"jax\"\n", + "ENV XLA_PYTHON_CLIENT_MEM_FRACTION=\"0.9\"\n", + "\n", + "\n", + "# Set the entrypoint to the Apache Beam SDK launcher.\n", + "ENTRYPOINT [\"/opt/apache/beam/boot\"]" + ] + }, + { + "cell_type": "markdown", + "id": "i63FUxXwsSsO", + "metadata": { + "id": "i63FUxXwsSsO" + }, + "source": [ + "### Run the pipeline\n", + "\n", + "The following code creates and runs the pipeline.\n", + "\n", + "- The `pip install` steps are needed to run the code in the notebook, but aren't needed when running the code in your container.\n", + "\n", + "- Without a GPU, the inference takes a long time to complete." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebb44382-ee7b-4cec-af67-1fe220cfb40d", + "metadata": { + "id": "ebb44382-ee7b-4cec-af67-1fe220cfb40d", + "tags": [] + }, + "outputs": [], + "source": [ + "pip install apache_beam[gcp]==\"2.54.0\" keras_nlp==\"0.14.3\" keras>=\"3\" jax[cuda12]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "oPgRBScKThZg", + "metadata": { + "id": "oPgRBScKThZg" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"jax\" # Or \"tensorflow\" or \"torch\".\n", + "os.environ[\"XLA_PYTHON_CLIENT_MEM_FRACTION\"] = \"0.9\"\n", + "\n", + "import keras\n", + "import keras_nlp\n", + "import numpy as np\n", + "import json\n", + "import ast\n", + "import re\n", + "import logging\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference import utils\n", + "from apache_beam.ml.inference.base import RunInference\n", + "from apache_beam.options import pipeline_options\n", + "from apache_beam.options.pipeline_options import GoogleCloudOptions\n", + "from apache_beam.options.pipeline_options import PipelineOptions\n", + "from apache_beam.options.pipeline_options import SetupOptions\n", + "from apache_beam.options.pipeline_options import StandardOptions\n", + "from apache_beam.options.pipeline_options import WorkerOptions\n", + "from apache_beam.ml.inference import utils\n", + "from apache_beam.ml.inference.base import ModelHandler\n", + "from apache_beam.ml.inference.base import PredictionResult\n", + "from apache_beam.ml.inference.base import KeyedModelHandler\n", + "from keras_nlp.models import GemmaCausalLM\n", + "from typing import Any, Dict, Iterable, Optional, Sequence" + ] + }, + { + "cell_type": "markdown", + "id": "0gicDesYWdbu", + "metadata": { + "id": "0gicDesYWdbu" + }, + "source": [ + "Set pipeline options and provide the input Pub/Sub topic. 
The options that are commented out enable running the pipeline on Google Cloud Dataflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "wpG-ltLiTxWM", + "metadata": { + "id": "wpG-ltLiTxWM" + }, + "outputs": [], + "source": [ + "options = PipelineOptions()\n", + "options.view_as(StandardOptions).streaming = True\n", + "options.view_as(SetupOptions).save_main_session = True\n", + "\n", + "# options.view_as(StandardOptions).runner = \"dataflowrunner\"\n", + "# options.view_as(GoogleCloudOptions).project = \n", + "# options.view_as(GoogleCloudOptions).temp_location= \n", + "# options.view_as(GoogleCloudOptions).region= \"us-west1\"\n", + "# options.view_as(WorkerOptions).machine_type= \"g2-standard-4\"\n", + "# options.view_as(WorkerOptions).worker_harness_container_image = \n", + "# options.view_as(WorkerOptions).disk_size_gb=200\n", + "# options.view_as(GoogleCloudOptions).dataflow_service_options=[\"worker_accelerator=type:nvidia-l4;count:1;install-nvidia-driver\"]\n", + "\n", + "topic_reviews=\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "g8sWSMRmW-Ab", + "metadata": { + "id": "g8sWSMRmW-Ab" + }, + "source": [ + "Define a custom model handler that will load the Gemma model and handle inference calls." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "vRVCN3qBUAt9", + "metadata": { + "id": "vRVCN3qBUAt9" + }, + "outputs": [], + "source": [ + "class GemmaModelHandler(ModelHandler[str,\n", + " PredictionResult,GemmaCausalLM\n", + " ]):\n", + " def __init__(\n", + " self,\n", + " model_name: str = \"\",\n", + " ):\n", + " \"\"\" Implementation of the ModelHandler interface for Gemma using text as input.\n", + "\n", + " Example Usage::\n", + "\n", + " pcoll | RunInference(GemmaModelHandler())\n", + "\n", + " Args:\n", + " model_name: The Gemma model uri.\n", + " \"\"\"\n", + " self._model_name = model_name\n", + " self._env_vars = {}\n", + " def share_model_across_processes(self) -> bool:\n", + " \"\"\"Returns whether to share a single model in memory across processes.\n", + "\n", + " This is useful when the loaded model is large, preventing potential\n", + " out-of-memory issues when running the pipeline.\n", + "\n", + " Returns:\n", + " bool\n", + " \"\"\"\n", + " return True\n", + "\n", + " def load_model(self) -> GemmaCausalLM:\n", + " \"\"\"Loads and initializes a model for processing.\"\"\"\n", + " return keras_nlp.models.GemmaCausalLM.from_preset(self._model_name)\n", + "\n", + " def run_inference(\n", + " self,\n", + " batch: Sequence[str],\n", + " model: GemmaCausalLM,\n", + " inference_args: Optional[Dict[str, Any]] = None\n", + " ) -> Iterable[PredictionResult]:\n", + " \"\"\"Runs inferences on a batch of text strings.\n", + "\n", + " Args:\n", + " batch: A sequence of examples as text strings.\n", + " model:\n", + " inference_args: Any additional arguments for an inference.\n", + "\n", + " Returns:\n", + " An Iterable of type PredictionResult.\n", + " \"\"\"\n", + " # Loop each text string, and use a tuple to store the inference results.\n", + " predictions = []\n", + " for one_text in batch:\n", + " result = model.generate(one_text, max_length=1024)\n", + " predictions.append(result)\n", + " return utils._convert_to_result(batch, 
predictions, self._model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "cSbAFPXmXPMc", + "metadata": { + "id": "cSbAFPXmXPMc" + }, + "source": [ + "We define a prompt template to format a given input as well as instruct the model on the task being asked of it. This block also has an example input to the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hqh-Ro5-UNqy", + "metadata": { + "id": "hqh-Ro5-UNqy" + }, + "outputs": [], + "source": [ + "prompt_template = \"\"\"\n", + "\n", + "Provide the results of doing these two tasks on the chat history provided below for the user {}\n", + "task 1 : assess if the tone is happy = 1 , neutral = 0 or unhappy = -1\n", + "task 2 : summarize the text with a maximum of 512 characters\n", + "Return the answer as a JSON string with fields [sentiment, summary] do NOT explain your answer\n", + "\n", + "@@@{}@@@\n", + "\n", + "\"\"\"\n", + "chat_text = \"\"\"\n", + "id 221: Hay I am really annoyed that your menu includes a pizza with pineapple on it!\n", + "id 331: Sorry to hear that , but pineapple is nice on pizza\n", + "id 221: What a terriable thing to say! Its never ok, so unhappy right now!\n", + "\"\"\"\n", + "\n", + "# Example input\n", + "chat =json.dumps({\"id\" : 42, \"user_id\" : 221 , \"chat_message\" : chat_text})\n", + "print(chat)" + ] + }, + { + "cell_type": "markdown", + "id": "8UFfKvSeYn0b", + "metadata": { + "id": "8UFfKvSeYn0b" + }, + "source": [ + "Define pre and post-processing functions. `CreatePrompt` creates a key-value pair of the chat ID and the formatted prompt. `extract_model_reply` parses the response, extracting the JSON string we requested from the model; however, the LLM is not *guaranteed* to return a JSON-formatted object, so we also raise an exception if the reply is malformed. This helper is then used in the `SentimentAnalysis` `DoFn` to split out the sentiment score as well as the summary of the text.
The sentiment score is then used to tag the output, allowing the pipeline to handle different sentiments in different ways." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dDIys7XaUPvl", + "metadata": { + "id": "dDIys7XaUPvl" + }, + "outputs": [], + "source": [ + "keyed_model_handler = KeyedModelHandler(GemmaModelHandler('gemma_2'))\n", + "\n", + "# Create the prompt by using the information from the chat.\n", + "class CreatePrompt(beam.DoFn):\n", + " def process(self, element, *args, **kwargs):\n", + " user_chat = json.loads(element)\n", + " chat_id = user_chat['id']\n", + " user_id = user_chat['user_id']\n", + " messages = user_chat['chat_message']\n", + " yield (chat_id, prompt_template.format(user_id, messages))\n", + "\n", + "def extract_model_reply(model_inference):\n", + " print(model_inference)\n", + " match = re.search(r\"(\\{[\\s\\S]*?\\})\", model_inference)\n", + " json_str = match.group(1)\n", + " print(json_str)\n", + " result = json.loads(json_str)\n", + " if all(key in result for key in ['sentiment', 'summary']):\n", + " return result\n", + " raise Exception('Malformed model reply')\n", + "\n", + " # @title\n", + "class SentimentAnalysis(beam.DoFn):\n", + " def process(self, element):\n", + " key = element[0]\n", + " match = re.search(r\"@@@([\\s\\S]*?)@@@\", element[1].example)\n", + " chats = match.group(1)\n", + "\n", + " try:\n", + " # The result contains the prompt. 
Replace the prompt with \"\".\n", + " result = extract_model_reply(element[1].inference.replace(element[1].example, \"\"))\n", + " processed_result = (key, chats, result['sentiment'], result['summary'])\n", + "\n", + " if (result['sentiment'] ==-1):\n", + " output = beam.TaggedOutput('negative', processed_result)\n", + " else:\n", + " output = beam.TaggedOutput('main', processed_result)\n", + "\n", + " except Exception as err:\n", + " print(\"ERROR!\" + str(err))\n", + " output = beam.TaggedOutput('error', element)\n", + "\n", + " yield output\n", + "\n", + "gemma_inference = RunInference(keyed_model_handler)" + ] + }, + { + "cell_type": "markdown", + "id": "Yj9aQ0q8YLOn", + "metadata": { + "id": "Yj9aQ0q8YLOn" + }, + "source": [ + "Finally, execute the pipeline using the code below. To use the example chat input created earlier instead of a custom Pub/Sub source, use `chats = p | beam.Create([chat])` instead of the Pub/Sub read." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb47a17-9563-46f6-9768-73f4802694e8", + "metadata": { + "id": "1fb47a17-9563-46f6-9768-73f4802694e8", + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "with beam.Pipeline(options=options) as p:\n", + " chats = (p | \"Read Topic\" >>\n", + " beam.io.ReadFromPubSub(subscription=topic_reviews)\n", + " | \"Parse\" >> beam.Map(lambda x: x.decode(\"utf-8\")))\n", + " prompts = chats | \"Create Prompt\" >> beam.ParDo(CreatePrompt())\n", + " results = prompts | \"RunInference-Gemma\" >> gemma_inference\n", + " filtered_results = results | \"Process Results\" >> beam.ParDo(SentimentAnalysis()).with_outputs('main','negative','error')\n", + " generated_responses = (\n", + " filtered_results.negative\n", + " | \"Generate Response\" >> beam.Map(lambda x: ((x[0], x[3]), \"Generate an appology reponse for the user in this chat text: \" + x[1] + \"\"))\n", + " | \"Gemma-Response\" >> gemma_inference\n", + " )\n", + "\n", + " generated_responses | \"Print Reponse\" >> 
beam.Map(lambda x: logging.info(x))\n", + " filtered_results.main | \"Print Main\" >> beam.Map(lambda x: logging.info(x))\n", + " filtered_results.error | \"Print Errors\" >> beam.Map(lambda x: logging.info(x))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "environment": { + "kernel": "apache-beam-2.57.0", + "name": ".m121", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/:m121" + }, + "kernelspec": { + "display_name": "Apache Beam 2.57.0 (Local)", + "language": "python", + "name": "apache-beam-2.57.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From eaa3b56dbb1990526c03de9fe61ff2b8af1e226c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 14 Aug 2024 11:52:21 -0400 Subject: [PATCH 78/78] Bump github.com/nats-io/nats.go from 1.36.0 to 1.37.0 in /sdks (#32174) Bumps [github.com/nats-io/nats.go](https://github.com/nats-io/nats.go) from 1.36.0 to 1.37.0. - [Release notes](https://github.com/nats-io/nats.go/releases) - [Commits](https://github.com/nats-io/nats.go/compare/v1.36.0...v1.37.0) --- updated-dependencies: - dependency-name: github.com/nats-io/nats.go dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 53aa6f23bfaa2..3a5851cad12f2 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -45,7 +45,7 @@ require ( github.com/lib/pq v1.10.9 github.com/linkedin/goavro/v2 v2.13.0 github.com/nats-io/nats-server/v2 v2.10.18 - github.com/nats-io/nats.go v1.36.0 + github.com/nats-io/nats.go v1.37.0 github.com/proullon/ramsql v0.1.4 github.com/spf13/cobra v1.8.1 github.com/testcontainers/testcontainers-go v0.32.0 diff --git a/sdks/go.sum b/sdks/go.sum index f49007e6b7d9a..284a50039541a 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1077,8 +1077,8 @@ github.com/nats-io/jwt/v2 v2.5.8 h1:uvdSzwWiEGWGXf+0Q+70qv6AQdvcvxrv9hPM0RiPamE= github.com/nats-io/jwt/v2 v2.5.8/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A= github.com/nats-io/nats-server/v2 v2.10.18 h1:tRdZmBuWKVAFYtayqlBB2BuCHNGAQPvoQIXOKwU3WSM= github.com/nats-io/nats-server/v2 v2.10.18/go.mod h1:97Qyg7YydD8blKlR8yBsUlPlWyZKjA7Bp5cl3MUE9K8= -github.com/nats-io/nats.go v1.36.0 h1:suEUPuWzTSse/XhESwqLxXGuj8vGRuPRoG7MoRN/qyU= -github.com/nats-io/nats.go v1.36.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= +github.com/nats-io/nats.go v1.37.0 h1:07rauXbVnnJvv1gfIyghFEo6lUcYRY0WXc3x7x0vUxE= +github.com/nats-io/nats.go v1.37.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8= github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI= github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc= github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=