
Commit

[WIP][feature][spark] Support streaming
CheneyYin committed Aug 27, 2024
1 parent 62b4f16 commit 6392a37
Showing 32 changed files with 1,052 additions and 182 deletions.
@@ -20,6 +20,7 @@
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

@@ -30,6 +31,8 @@ public final class SeaTunnelRow implements Serializable {
private String tableId = "";
/** The kind of change that a row describes in a changelog. */
private RowKind rowKind = RowKind.INSERT;

private Map<String, String> metadata = new HashMap<>();
/** The array to store the actual internal format values. */
private final Object[] fields;

@@ -67,6 +70,14 @@ public RowKind getRowKind() {
return this.rowKind;
}

public Map<String, String> getMetadata() {
return this.metadata;
}

public void setMetadata(Map<String, String> metadata) {
this.metadata = metadata;
}

public Object[] getFields() {
return fields;
}
@@ -81,6 +92,7 @@ public SeaTunnelRow copy() {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}

@@ -92,6 +104,7 @@ public SeaTunnelRow copy(int[] indexMapping) {
SeaTunnelRow newRow = new SeaTunnelRow(newFields);
newRow.setRowKind(this.getRowKind());
newRow.setTableId(this.getTableId());
newRow.setMetadata(this.getMetadata());
return newRow;
}
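A minimal usage sketch (not part of this commit) of the new per-row metadata; the metadata key shown is hypothetical. It illustrates that copy() now carries the map along with rowKind and tableId:

import java.util.Collections;

import org.apache.seatunnel.api.table.type.RowKind;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;

public class RowMetadataSketch {
    public static void main(String[] args) {
        // two-column row; SeaTunnelRow stores values as an Object[]
        SeaTunnelRow row = new SeaTunnelRow(new Object[] {1, "foo"});
        row.setRowKind(RowKind.INSERT);
        row.setTableId("demo.table");
        row.setMetadata(Collections.singletonMap("source.offset", "42")); // hypothetical key

        SeaTunnelRow copied = row.copy();
        // the metadata map is propagated by copy(), together with rowKind and tableId
        System.out.println(copied.getMetadata()); // {source.offset=42}
    }
}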

@@ -39,6 +39,8 @@ public final class Constants {

public static final String CHECKPOINT_ID = "checkpoint.id";

public static final String CHECKPOINT_LOCATION = "checkpointLocation";

public static final String UUID = "uuid";

public static final String NOW = "now";
@@ -40,6 +40,8 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
@@ -132,9 +134,22 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getSparkSession().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.save();
if (isStreaming()) {
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.start();
try {
streamingQuery.awaitTermination();
} catch (StreamingQueryException e) {
throw new RuntimeException(e);
}
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.save();
}
}
// the sink is the last stream
return null;
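For context (not part of the diff), the streaming branch above follows the stock Structured Streaming write pattern. A self-contained sketch, with placeholder source/sink formats and checkpoint path standing in for the SeaTunnel connector and its real options:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;

public class StreamingWriteSketch {
    public static void main(String[] args) throws Exception {
        SparkSession spark =
                SparkSession.builder().appName("sketch").master("local[*]").getOrCreate();
        // the built-in rate source stands in for SeaTunnelSource here
        Dataset<Row> stream =
                spark.readStream().format("rate").option("rowsPerSecond", "1").load();
        StreamingQuery query =
                stream.writeStream()
                        .format("console")                                 // placeholder sink
                        .option("checkpointLocation", "/tmp/sketch-ckpt")  // required for recovery
                        .start();
        query.awaitTermination(); // blocks until the query stops, as in the code above
    }
}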
@@ -41,11 +41,15 @@
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import static org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode.HANDLE_SAVE_MODE_FAILED;
@@ -132,10 +136,29 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
sparkRuntimeEnvironment.getStreamingContext().sparkContext().applicationId();
CatalogTable[] catalogTables =
datasetTableInfo.getCatalogTables().toArray(new CatalogTable[0]);
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp")
.mode(SaveMode.Append)
.save();
if (isStreaming()) {
try {
StreamingQuery streamingQuery =
SparkSinkInjector.inject(
dataset.writeStream(),
sink,
catalogTables,
applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.outputMode(OutputMode.Append())
.start();
streamingQuery.awaitTermination();
} catch (TimeoutException e) {
throw new RuntimeException(e);
} catch (StreamingQueryException e) {
throw new RuntimeException(e);
}
} else {
SparkSinkInjector.inject(dataset.write(), sink, catalogTables, applicationId)
.option("checkpointLocation", "/tmp/test-spark-seatunnel")
.mode(SaveMode.Append)
.save();
}
}
// the sink is the last stream
return null;
@@ -35,6 +35,7 @@

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import com.google.common.collect.Lists;

@@ -86,17 +87,31 @@ public List<DatasetTableInfo> execute(List<DatasetTableInfo> upstreamDataStreams
CommonOptions.PARALLELISM.key(),
CommonOptions.PARALLELISM.defaultValue());
}
Dataset<Row> dataset =
sparkRuntimeEnvironment
.getSparkSession()
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
SparkSession sparkSession = sparkRuntimeEnvironment.getSparkSession();
Dataset<Row> dataset = null;
if (isStreaming()) {
dataset =
sparkSession
.readStream()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
} else {
dataset =
sparkSession
.read()
.format(SeaTunnelSource.class.getSimpleName())
.option(CommonOptions.PARALLELISM.key(), parallelism)
.option(
Constants.SOURCE_SERIALIZATION,
SerializationUtils.objectToString(source))
.options(envOption)
.load();
}
sources.add(
new DatasetTableInfo(
dataset,
@@ -20,6 +20,8 @@
import org.apache.seatunnel.shade.com.typesafe.config.Config;

import org.apache.seatunnel.api.common.JobContext;
import org.apache.seatunnel.api.env.EnvCommonOptions;
import org.apache.seatunnel.common.constants.JobMode;
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.core.starter.execution.PluginExecuteProcessor;
import org.apache.seatunnel.translation.spark.execution.DatasetTableInfo;
@@ -51,6 +53,16 @@ protected SparkAbstractPluginExecuteProcessor(
this.plugins = initializePlugins(pluginConfigs);
}

public boolean isStreaming() {
if (sparkRuntimeEnvironment.getConfig().hasPath(EnvCommonOptions.JOB_MODE.key())) {
return sparkRuntimeEnvironment
.getConfig()
.getString(EnvCommonOptions.JOB_MODE.key())
.equalsIgnoreCase(JobMode.STREAMING.name());
}
return false;
}

@Override
public void setRuntimeEnvironment(SparkRuntimeEnvironment sparkRuntimeEnvironment) {
this.sparkRuntimeEnvironment = sparkRuntimeEnvironment;
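A small sketch (not part of the commit) of the env setting that isStreaming() inspects. It assumes EnvCommonOptions.JOB_MODE resolves to the "job.mode" key and that the shaded ConfigFactory is available; the HOCON string mirrors an "env { job.mode = STREAMING }" block in a job config:

import org.apache.seatunnel.shade.com.typesafe.config.Config;
import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory;

public class JobModeSketch {
    public static void main(String[] args) {
        // stand-in for the env section of a SeaTunnel job config
        Config env = ConfigFactory.parseString("job.mode = STREAMING");
        boolean streaming =
                env.hasPath("job.mode") && env.getString("job.mode").equalsIgnoreCase("STREAMING");
        System.out.println(streaming); // true -> processors take the readStream/writeStream path
    }
}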
@@ -102,6 +102,7 @@ public Offset deserializeOffset(String microBatchState) {
@Override
public void commit(Offset end) {
// no-op commit; the println below is WIP debug output of the committed offset
System.out.println(end);
}

@Override
@@ -47,10 +47,13 @@ public class SeaTunnelSinkTable implements Table, SupportsWrite {
private final SeaTunnelSink<SeaTunnelRow, ?, ?, ?> sink;

private final CatalogTable[] catalogTables;
private final String checkpointLocation;
private final String jobId;

public SeaTunnelSinkTable(Map<String, String> properties) {
this.properties = properties;
this.checkpointLocation =
properties.getOrDefault(Constants.CHECKPOINT_LOCATION, "/tmp/seatunnel");
String sinkSerialization = properties.getOrDefault(Constants.SINK_SERIALIZATION, "");
if (StringUtils.isBlank(sinkSerialization)) {
throw new IllegalArgumentException(Constants.SINK_SERIALIZATION + " must be specified");
@@ -68,7 +71,8 @@ public SeaTunnelSinkTable(Map<String, String> properties) {

@Override
public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
return new SeaTunnelWriteBuilder<>(sink, catalogTables, jobId);
return new SeaTunnelWriteBuilder<>(
sink, properties, catalogTables, checkpointLocation, jobId);
}

@Override
@@ -15,16 +15,14 @@
* limitations under the License.
*/

package org.apache.seatunnel.translation.spark.sink;
package org.apache.seatunnel.translation.spark.sink.write;

import org.apache.seatunnel.api.sink.MultiTableResourceManager;
import org.apache.seatunnel.api.sink.SeaTunnelSink;
import org.apache.seatunnel.api.sink.SinkAggregatedCommitter;
import org.apache.seatunnel.api.sink.SupportResourceShare;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkDataWriterFactory;
import org.apache.seatunnel.translation.spark.sink.write.SeaTunnelSparkWriterCommitMessage;

import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.DataWriterFactory;
@@ -37,6 +35,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

@@ -48,18 +47,24 @@ public class SeaTunnelBatchWrite<StateT, CommitInfoT, AggregatedCommitInfoT>
private final SinkAggregatedCommitter<CommitInfoT, AggregatedCommitInfoT> aggregatedCommitter;

private MultiTableResourceManager resourceManager;
private final Map<String, String> properties;

private final CatalogTable[] catalogTables;
private final String checkpointLocation;

private final String jobId;

public SeaTunnelBatchWrite(
SeaTunnelSink<SeaTunnelRow, StateT, CommitInfoT, AggregatedCommitInfoT> sink,
Map<String, String> properties,
CatalogTable[] catalogTables,
String checkpointLocation,
String jobId)
throws IOException {
this.sink = sink;
this.properties = properties;
this.catalogTables = catalogTables;
this.checkpointLocation = checkpointLocation;
this.jobId = jobId;
this.aggregatedCommitter = sink.createAggregatedCommitter().orElse(null);
if (aggregatedCommitter != null) {
@@ -78,7 +83,8 @@ public SeaTunnelBatchWrite(

@Override
public DataWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) {
return new SeaTunnelSparkDataWriterFactory<>(sink, catalogTables, jobId);
return new SeaTunnelSparkDataWriterFactory<>(
sink, properties, catalogTables, checkpointLocation, jobId);
}

@Override
