[WIP][feature][spark] Support streaming
CheneyYin committed Aug 29, 2024
1 parent 62b4f16 commit d3d78b3
Showing 35 changed files with 1,436 additions and 358 deletions.
@@ -20,6 +20,7 @@
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

@@ -30,6 +31,8 @@ public final class SeaTunnelRow implements Serializable {
private String tableId = "";
/** The kind of change that a row describes in a changelog. */
private RowKind rowKind = RowKind.INSERT;

private Map<String, String> metadata = new HashMap<>();
/** The array to store the actual internal format values. */
private final Object[] fields;

@@ -67,6 +70,14 @@ public RowKind getRowKind() {
return this.rowKind;
}

public Map<String, String> getMetadata() {
return this.metadata;
}

public void setMetadata(Map<String, String> metadata) {
this.metadata = metadata;
}

public Object[] getFields() {
return fields;
}
@@ -81,6 +92,7 @@ public SeaTunnelRow copy() {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}

@@ -92,6 +104,7 @@ public SeaTunnelRow copy(int[] indexMapping) {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}
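The metadata map added above gives every SeaTunnelRow a string-keyed side channel that survives copy(); the streaming changes later in this commit use it to tag synthetic checkpoint rows. A minimal usage sketch (the "checkpoint" key below is purely illustrative; the real keys are defined by CheckpointMetadata, which is not shown in this hunk):

import org.apache.seatunnel.api.table.type.SeaTunnelRow;

import java.util.HashMap;
import java.util.Map;

public class RowMetadataExample {
    public static void main(String[] args) {
        SeaTunnelRow row = new SeaTunnelRow(new Object[] {1, "foo"});

        // Attach metadata; "checkpoint" is a hypothetical key used only for illustration.
        Map<String, String> metadata = new HashMap<>();
        metadata.put("checkpoint", "true");
        row.setMetadata(metadata);

        // copy() now carries the metadata along with rowKind and tableId.
        SeaTunnelRow copied = row.copy();
        System.out.println(copied.getMetadata().get("checkpoint")); // prints: true
    }
}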

@@ -39,6 +39,8 @@ public final class Constants {

public static final String CHECKPOINT_ID = "checkpoint.id";

public static final String CHECKPOINT_LOCATION = "checkpointLocation";

public static final String UUID = "uuid";

public static final String NOW = "now";
@@ -40,6 +40,8 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
@@ -132,9 +134,22 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getSparkSession().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.save();
if (isStreaming()) {
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.start();
try {
streamingQuery.awaitTermination();
} catch (StreamingQueryException e) {
throw new RuntimeException(e);
}
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.save();
}
}
// the sink is the last stream
return null;
@@ -41,11 +41,15 @@
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import static org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode.HANDLE_SAVE_MODE_FAILED;
@@ -132,10 +136,29 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getStreamingContext().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.mode(SaveMode.Append)
.save();
if (isStreaming()) {
try {
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(),
sink,
catalogTables,
applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.outputMode(OutputMode.Append())
.start();
streamingQuery.awaitTermination();
} catch (TimeoutException | StreamingQueryException e) {
throw new RuntimeException(e);
}
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.mode(SaveMode.Append)
.save();
}
}
// the sink is the last stream
return null;
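Note that calling awaitTermination() on each query inside the sink loop blocks until that query stops, so a job with several streaming sinks would start them one at a time. A sketch of one alternative (not what this commit does): start every query first, then block once on the session's StreamingQueryManager. The spark session and the list of pre-built DataStreamWriter instances are assumed inputs here.

import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.DataStreamWriter;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.util.List;
import java.util.concurrent.TimeoutException;

public class MultiSinkStreamingSketch {

    /** Starts all writers without blocking, then waits for any query to terminate. */
    public static void startAndAwait(SparkSession spark, List<DataStreamWriter<Row>> writers)
            throws TimeoutException, StreamingQueryException {
        for (DataStreamWriter<Row> writer : writers) {
            writer.start(); // non-blocking; the query keeps running in the background
        }
        // Single blocking point for the whole job instead of per-query awaitTermination().
        spark.streams().awaitAnyTermination();
    }
}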
@@ -35,6 +35,7 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.google.common.collect.Lists;

@@ -86,17 +87,31 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
CommonOptions.PARALLELISM.key(),
CommonOptions.PARALLELISM.defaultValue());
}
Dataset<Row> dataset =
sparkRuntimeEnvironment
.getSparkSession()
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
SparkSession sparkSession = sparkRuntimeEnvironment.getSparkSession();
Dataset<Row> dataset = null;
if (isStreaming()) {
dataset =
sparkSession
.readStream()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
} else {
dataset =
sparkSession
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
}
sources.add(
new DatasetTableInfo(
dataset,
@@ -20,6 +20,8 @@
import org.apache.seatunnel.shade.com.typesafe.config.Config;

import org.apache.seatunnel.api.common.JobContext;
import org.apache.seatunnel.api.env.EnvCommonOptions;
import org.apache.seatunnel.common.constants.JobMode;
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.core.starter.execution.PluginExecuteProcessor;
import org.apache.seatunnel.translation.spark.execution.DatasetTableInfo;
@@ -51,6 +53,16 @@ protected SparkAbstractPluginExecuteProcessor(
this.plugins = initializePlugins(pluginConfigs);
}

public boolean isStreaming() {
if (sparkRuntimeEnvironment.getConfig().hasPath(EnvCommonOptions.JOB_MODE.key())) {
return sparkRuntimeEnvironment
.getConfig()
.getString(EnvCommonOptions.JOB_MODE.key())
.equalsIgnoreCase(JobMode.STREAMING.name());
}
return false;
}
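
isStreaming() drives the whole batch-versus-streaming split off the env section of the job config. A standalone illustration of the same lookup, assuming EnvCommonOptions.JOB_MODE resolves to the key "job.mode" and using the plain Typesafe Config API rather than SeaTunnel's shaded copy:

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class JobModeExample {
    public static void main(String[] args) {
        // Mirrors the env block of a SeaTunnel job config, e.g. env { job.mode = "STREAMING" }.
        Config config = ConfigFactory.parseString("job.mode = STREAMING");

        boolean streaming =
                config.hasPath("job.mode")
                        && config.getString("job.mode").equalsIgnoreCase("STREAMING");

        System.out.println(streaming); // prints: true
    }
}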

@Override
public void setRuntimeEnvironment(SparkRuntimeEnvironment sparkRuntimeEnvironment) {
this.sparkRuntimeEnvironment = sparkRuntimeEnvironment;
@@ -31,6 +31,7 @@
import org.apache.seatunnel.core.starter.execution.PluginUtil;
import org.apache.seatunnel.plugin.discovery.seatunnel.SeaTunnelFactoryDiscovery;
import org.apache.seatunnel.plugin.discovery.seatunnel.SeaTunnelTransformPluginDiscovery;
import org.apache.seatunnel.translation.spark.execution.CheckpointMetadata;
import org.apache.seatunnel.translation.spark.execution.DatasetTableInfo;
import org.apache.seatunnel.translation.spark.serialization.SeaTunnelRowConverter;
import org.apache.seatunnel.translation.spark.utils.TypeConverterUtils;
@@ -163,19 +164,19 @@ private Dataset<Row> sparkTransform(SeaTunnelTransform transform, DatasetTableIn
private static class TransformIterator implements Iterator<Row>, Serializable {
private Iterator<Row> sourceIterator;
private SeaTunnelTransform<SeaTunnelRow> transform;
private StructType structType;
private StructType outputSchema;
private SeaTunnelRowConverter inputRowConverter;
private SeaTunnelRowConverter outputRowConverter;

public TransformIterator(
Iterator<Row> sourceIterator,
SeaTunnelTransform<SeaTunnelRow> transform,
StructType structType,
StructType outputSchema,
SeaTunnelRowConverter inputRowConverter,
SeaTunnelRowConverter outputRowConverter) {
this.sourceIterator = sourceIterator;
this.transform = transform;
this.structType = structType;
this.outputSchema = outputSchema;
this.inputRowConverter = inputRowConverter;
this.outputRowConverter = outputRowConverter;
}
@@ -190,6 +191,16 @@ public Row next() {
try {
Row row = sourceIterator.next();
SeaTunnelRow seaTunnelRow = inputRowConverter.unpack((GenericRowWithSchema) row);
if (CheckpointMetadata.of(seaTunnelRow.getMetadata()).isCheckpoint()) {
/*
 * Checkpoint event: bypass the transform and forward it downstream unchanged.
 */
return outputRowConverter.checkpointEvent(
outputSchema,
seaTunnelRow.getRowKind(),
seaTunnelRow.getTableId(),
CheckpointMetadata.of(seaTunnelRow.getMetadata()));
}
seaTunnelRow = (SeaTunnelRow) transform.map(seaTunnelRow);
if (seaTunnelRow == null) {
return null;
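
CheckpointMetadata itself is not included in the hunks shown here; purely for orientation, a hypothetical minimal shape that would satisfy the two call sites above (of(Map) and isCheckpoint()) could look like the sketch below. The key name is invented and may not match the real class.

import java.util.Map;

// Hypothetical sketch only; the real CheckpointMetadata class is not shown in this diff.
public final class CheckpointMetadataSketch {
    // Illustrative key; the key actually used by the commit may differ.
    private static final String CHECKPOINT_KEY = "checkpoint";

    private final Map<String, String> metadata;

    private CheckpointMetadataSketch(Map<String, String> metadata) {
        this.metadata = metadata;
    }

    public static CheckpointMetadataSketch of(Map<String, String> metadata) {
        return new CheckpointMetadataSketch(metadata);
    }

    public boolean isCheckpoint() {
        return Boolean.parseBoolean(metadata.getOrDefault(CHECKPOINT_KEY, "false"));
    }
}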
@@ -102,6 +102,7 @@ public Offset deserializeOffset(String microBatchState) {
@Override
public void commit(Offset end) {
// no offset state to persist here; print the committed offset for debugging while this is WIP
System.out.println(end);
}

@Override
@@ -47,10 +47,13 @@ public class SeaTunnelSinkTable implements Table, SupportsWrite {
private final SeaTunnelSink<SeaTunnelRow, ?, ?, ?> sink;

private final CatalogTable[] catalogTables;
private final String checkpointLocation;
private final String jobId;

public SeaTunnelSinkTable(Map<String, String> properties) {
this.properties = properties;
this.checkpointLocation =
properties.getOrDefault(Constants.CHECKPOINT_LOCATION, "/tmp/seatunnel");
String sinkSerialization = properties.getOrDefault(Constants.SINK_SERIALIZATION, "");
if (StringUtils.isBlank(sinkSerialization)) {
throw new IllegalArgumentException(Constants.SINK_SERIALIZATION + " must be specified");
@@ -68,7 +71,8 @@ public SeaTunnelSinkTable(Map<String, String> properties) {

@Override
public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
return new SeaTunnelWriteBuilder<>(sink, catalogTables, jobId);
return new SeaTunnelWriteBuilder<>(
sink, properties, catalogTables, info.schema(), checkpointLocation, jobId);
}

@Override
@@ -15,28 +15,28 @@
* limitations under the License.
*/

package org.apache.seatunnel.translation.spark.sink;
package org.apache.seatunnel.translation.spark.sink.write;

import org.apache.seatunnel.api.sink.MultiTableResourceManager;
import org.apache.seatunnel.api.sink.SeaTunnelSink;
import org.apache.seatunnel.api.sink.SinkAggregatedCommitter;
import org.apache.seatunnel.api.sink.SupportResourceShare;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkDataWriterFactory;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkWriterCommitMessage;

import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.DataWriterFactory;
import org.apache.spark.sql.connector.write.PhysicalWriteInfo;
import org.apache.spark.sql.connector.write.WriterCommitMessage;
import org.apache.spark.sql.connector.write.streaming.StreamingDataWriterFactory;
import org.apache.spark.sql.connector.write.streaming.StreamingWrite;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

@@ -48,18 +48,27 @@ public class SeaTunnelBatchWrite<StateT, CommitInfoT, AggregatedCommitInfoT>
private final SinkAggregatedCommitter<CommitInfoT, AggregatedCommitInfoT> aggregatedCommitter;

private MultiTableResourceManager resourceManager;
private final Map<String, String> properties;

private final CatalogTable[] catalogTables;
private final StructType schema;
private final String checkpointLocation;

private final String jobId;

public SeaTunnelBatchWrite(
SeaTunnelSink<SeaTunnelRow, StateT, CommitInfoT, AggregatedCommitInfoT> sink,
Map<String, String> properties,
CatalogTable[] catalogTables,
StructType schema,
String checkpointLocation,
String jobId)
throws IOException {
this.sink = sink;
this.properties = properties;
this.catalogTables = catalogTables;
this.schema = schema;
this.checkpointLocation = checkpointLocation;
this.jobId = jobId;
this.aggregatedCommitter = sink.createAggregatedCommitter().orElse(null);
if (aggregatedCommitter != null) {
@@ -78,7 +87,8 @@ public SeaTunnelBatchWrite(

@Override
public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
return new SeaTunnelSparkDataWriterFactory<>(sink, catalogTables, jobId);
return new SeaTunnelSparkDataWriterFactory<>(
sink, properties, catalogTables, schema, checkpointLocation, jobId);
}

@Override