[WIP][feature][spark] Support streaming #7476

Draft · wants to merge 2 commits into base: dev

@@ -20,6 +20,7 @@
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

@@ -30,6 +31,8 @@ public final class SeaTunnelRow implements Serializable {
private String tableId = "";
/** The kind of change that a row describes in a changelog. */
private RowKind rowKind = RowKind.INSERT;

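/** Row-level metadata; the streaming path uses it to carry checkpoint markers (see CheckpointMetadata). */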
private Map<String, String> metadata = new HashMap<>();
/** The array to store the actual internal format values. */
private final Object[] fields;

@@ -67,6 +70,14 @@ public RowKind getRowKind() {
return this.rowKind;
}

public Map<String, String> getMetadata() {
return this.metadata;
}

public void setMetadata(Map<String, String> metadata) {
this.metadata = metadata;
}

public Object[] getFields() {
return fields;
}
@@ -81,6 +92,7 @@ public SeaTunnelRow copy() {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}

@@ -92,6 +104,7 @@ public SeaTunnelRow copy(int[] indexMapping) {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}

@@ -39,6 +39,8 @@ public final class Constants {

public static final String CHECKPOINT_ID = "checkpoint.id";

public static final String CHECKPOINT_LOCATION = "checkpointLocation";

public static final String UUID = "uuid";

public static final String NOW = "now";
@@ -41,6 +41,8 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
@@ -124,9 +126,22 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getSparkSession().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.save();
if (isStreaming()) {
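                    // Streaming mode: write through writeStream()/start() and block until the query terminates.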
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.start();
try {
streamingQuery.awaitTermination();
} catch (StreamingQueryException e) {
throw new RuntimeException(e);
}
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.save();
}
}
// the sink is the last stream
return null;
@@ -42,12 +42,16 @@
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import static org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode.HANDLE_SAVE_MODE_FAILED;
@@ -124,10 +128,29 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getStreamingContext().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.mode(SaveMode.Append)
.save();
if (isStreaming()) {
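                    // Streaming mode: continuous write in Append output mode; the caller blocks until the query ends.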
try {
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(),
sink,
catalogTables,
applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.outputMode(OutputMode.Append())
.start();
streamingQuery.awaitTermination();
                    } catch (TimeoutException | StreamingQueryException e) {
                        throw new RuntimeException(e);
                    }
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.mode(SaveMode.Append)
.save();
}
}
// the sink is the last stream
return null;
@@ -35,6 +35,7 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.google.common.collect.Lists;

@@ -86,17 +87,31 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
CommonOptions.PARALLELISM.key(),
CommonOptions.PARALLELISM.defaultValue());
}
Dataset<Row> dataset =
sparkRuntimeEnvironment
.getSparkSession()
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
SparkSession sparkSession = sparkRuntimeEnvironment.getSparkSession();
Dataset<Row> dataset = null;
if (isStreaming()) {
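                // Streaming jobs read through readStream(); batch jobs keep the existing read() path.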
dataset =
sparkSession
.readStream()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
} else {
dataset =
sparkSession
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
}
sources.add(
new DatasetTableInfo(
dataset,
@@ -20,6 +20,8 @@
import org.apache.seatunnel.shade.com.typesafe.config.Config;

import org.apache.seatunnel.api.common.JobContext;
import org.apache.seatunnel.api.env.EnvCommonOptions;
import org.apache.seatunnel.common.constants.JobMode;
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.core.starter.execution.PluginExecuteProcessor;
import org.apache.seatunnel.translation.spark.execution.DatasetTableInfo;
@@ -51,6 +53,16 @@ protected SparkAbstractPluginExecuteProcessor(
this.plugins = initializePlugins(pluginConfigs);
}

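/**
 * Returns true when the job is configured to run in STREAMING mode via the
 * env-level job mode option; defaults to batch when the option is absent.
 */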
public boolean isStreaming() {
if (sparkRuntimeEnvironment.getConfig().hasPath(EnvCommonOptions.JOB_MODE.key())) {
return sparkRuntimeEnvironment
.getConfig()
.getString(EnvCommonOptions.JOB_MODE.key())
.equalsIgnoreCase(JobMode.STREAMING.name());
}
return false;
}
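// Illustrative only (not part of this diff): with the usual SeaTunnel HOCON job config,
// streaming mode would be selected through the env block, e.g.
//
//   env {
//     job.mode = "STREAMING"
//   }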

@Override
public void setRuntimeEnvironment(SparkRuntimeEnvironment sparkRuntimeEnvironment) {
this.sparkRuntimeEnvironment = sparkRuntimeEnvironment;
@@ -31,6 +31,7 @@
import org.apache.seatunnel.core.starter.execution.PluginUtil;
import org.apache.seatunnel.plugin.discovery.seatunnel.SeaTunnelFactoryDiscovery;
import org.apache.seatunnel.plugin.discovery.seatunnel.SeaTunnelTransformPluginDiscovery;
import org.apache.seatunnel.translation.spark.execution.CheckpointMetadata;
import org.apache.seatunnel.translation.spark.execution.DatasetTableInfo;
import org.apache.seatunnel.translation.spark.serialization.SeaTunnelRowConverter;
import org.apache.seatunnel.translation.spark.utils.TypeConverterUtils;
@@ -180,19 +181,19 @@ private Dataset<Row> sparkTransform(SeaTunnelTransform transform, DatasetTableIn
private static class TransformIterator implements Iterator<Row>, Serializable {
private Iterator<Row> sourceIterator;
private SeaTunnelTransform<SeaTunnelRow> transform;
private StructType structType;
private StructType outputSchema;
private SeaTunnelRowConverter inputRowConverter;
private SeaTunnelRowConverter outputRowConverter;

public TransformIterator(
Iterator<Row> sourceIterator,
SeaTunnelTransform<SeaTunnelRow> transform,
StructType structType,
StructType outputSchema,
SeaTunnelRowConverter inputRowConverter,
SeaTunnelRowConverter outputRowConverter) {
this.sourceIterator = sourceIterator;
this.transform = transform;
this.structType = structType;
this.outputSchema = outputSchema;
this.inputRowConverter = inputRowConverter;
this.outputRowConverter = outputRowConverter;
}
@@ -207,6 +208,16 @@ public Row next() {
try {
Row row = sourceIterator.next();
SeaTunnelRow seaTunnelRow = inputRowConverter.unpack((GenericRowWithSchema) row);
if (CheckpointMetadata.of(seaTunnelRow.getMetadata()).isCheckpoint()) {
/*
 * Checkpoint marker row: do not run it through the transform, just re-emit
 * it downstream so the checkpoint propagates through the pipeline.
 */
return outputRowConverter.checkpointEvent(
outputSchema,
seaTunnelRow.getRowKind(),
seaTunnelRow.getTableId(),
CheckpointMetadata.of(seaTunnelRow.getMetadata()));
}
seaTunnelRow = (SeaTunnelRow) transform.map(seaTunnelRow);
if (seaTunnelRow == null) {
return null;
@@ -102,6 +102,7 @@ public Offset deserializeOffset(String microBatchState) {
@Override
public void commit(Offset end) {
// debug: print the committed end offset
System.out.println(end);
}

@Override
@@ -47,10 +47,13 @@ public class SeaTunnelSinkTable implements Table, SupportsWrite {
private final SeaTunnelSink<SeaTunnelRow, ?, ?, ?> sink;

private final CatalogTable[] catalogTables;
private final String checkpointLocation;
private final String jobId;

public SeaTunnelSinkTable(Map<String, String> properties) {
this.properties = properties;
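// Checkpoint directory for the streaming write; defaults to /tmp/seatunnel when not supplied.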
this.checkpointLocation =
properties.getOrDefault(Constants.CHECKPOINT_LOCATION, "/tmp/seatunnel");
String sinkSerialization = properties.getOrDefault(Constants.SINK_SERIALIZATION, "");
if (StringUtils.isBlank(sinkSerialization)) {
throw new IllegalArgumentException(Constants.SINK_SERIALIZATION + " must be specified");
@@ -68,7 +71,8 @@ public SeaTunnelSinkTable(Map<String, String> properties) {

@Override
public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
return new SeaTunnelWriteBuilder<>(sink, catalogTables, jobId);
return new SeaTunnelWriteBuilder<>(
sink, properties, catalogTables, info.schema(), checkpointLocation, jobId);
}

@Override
@@ -15,28 +15,28 @@
* limitations under the License.
*/

package org.apache.seatunnel.translation.spark.sink;
package org.apache.seatunnel.translation.spark.sink.write;

import org.apache.seatunnel.api.sink.MultiTableResourceManager;
import org.apache.seatunnel.api.sink.SeaTunnelSink;
import org.apache.seatunnel.api.sink.SinkAggregatedCommitter;
import org.apache.seatunnel.api.sink.SupportResourceShare;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkDataWriterFactory;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkWriterCommitMessage;

import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.DataWriterFactory;
import org.apache.spark.sql.connector.write.PhysicalWriteInfo;
import org.apache.spark.sql.connector.write.WriterCommitMessage;
import org.apache.spark.sql.connector.write.streaming.StreamingDataWriterFactory;
import org.apache.spark.sql.connector.write.streaming.StreamingWrite;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

@@ -48,18 +48,27 @@ public class SeaTunnelBatchWrite<StateT, CommitInfoT, AggregatedCommitInfoT>
private final SinkAggregatedCommitter<CommitInfoT, AggregatedCommitInfoT> aggregatedCommitter;

private MultiTableResourceManager resourceManager;
private final Map<String, String> properties;

private final CatalogTable[] catalogTables;
private final StructType schema;
private final String checkpointLocation;

private final String jobId;

public SeaTunnelBatchWrite(
SeaTunnelSink<SeaTunnelRow, StateT, CommitInfoT, AggregatedCommitInfoT> sink,
Map<String, String> properties,
CatalogTable[] catalogTables,
StructType schema,
String checkpointLocation,
String jobId)
throws IOException {
this.sink = sink;
this.properties = properties;
this.catalogTables = catalogTables;
this.schema = schema;
this.checkpointLocation = checkpointLocation;
this.jobId = jobId;
this.aggregatedCommitter = sink.createAggregatedCommitter().orElse(null);
if (aggregatedCommitter != null) {
@@ -78,7 +87,8 @@ public SeaTunnelBatchWrite(

@Override
public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
return new SeaTunnelSparkDataWriterFactory<>(sink, catalogTables, jobId);
return new SeaTunnelSparkDataWriterFactory<>(
sink, properties, catalogTables, schema, checkpointLocation, jobId);
}

@Override