Skip to content

Commit

Permalink
[Feature][Connector-v2] Support streaming read for paimon (#7681)
Browse files Browse the repository at this point in the history
  • Loading branch information
dailai committed Sep 20, 2024
1 parent 4f5d27f commit 4a2e272
Show file tree
Hide file tree
Showing 16 changed files with 883 additions and 237 deletions.
29 changes: 25 additions & 4 deletions docs/en/connector-v2/source/Paimon.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Read data from Apache Paimon.
## Key features

- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
- [x] [stream](../../concept/connector-v2-features.md)
- [ ] [exactly-once](../../concept/connector-v2-features.md)
- [ ] [column projection](../../concept/connector-v2-features.md)
- [ ] [parallelism](../../concept/connector-v2-features.md)
Expand Down Expand Up @@ -157,9 +157,30 @@ source {
```

## Changelog
If you want to read the changelog of this connector, your sink table of paimon which mast has the options named `changelog-producer=input`, then you can refer to [Paimon changelog](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/).
Currently, we only support the `input` and `none` mode of changelog producer. If the changelog producer is `input`, the streaming read of the connector will generate -U,+U,+I,+D data. But if the changelog producer is `none`, the streaming read of the connector will generate +I,+U,+D data.

### next version
### Streaming read example
```hocon
env {
parallelism = 1
job.mode = "Streaming"
}
- Add Paimon Source Connector
- Support projection for Paimon Source
source {
Paimon {
warehouse = "/tmp/paimon"
database = "full_type"
table = "st_test"
}
}
sink {
Paimon {
warehouse = "/tmp/paimon"
database = "full_type"
table = "st_test_sink"
paimon.table.primary-keys = "c_tinyint"
}
}
```
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.seatunnel.connectors.seatunnel.paimon.source;

import org.apache.seatunnel.api.common.JobContext;
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.source.Boundedness;
import org.apache.seatunnel.api.source.SeaTunnelSource;
Expand All @@ -26,18 +27,23 @@
import org.apache.seatunnel.api.table.catalog.TablePath;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
import org.apache.seatunnel.common.constants.JobMode;
import org.apache.seatunnel.connectors.seatunnel.paimon.catalog.PaimonCatalog;
import org.apache.seatunnel.connectors.seatunnel.paimon.config.PaimonSourceConfig;
import org.apache.seatunnel.connectors.seatunnel.paimon.source.converter.SqlToPaimonPredicateConverter;
import org.apache.seatunnel.connectors.seatunnel.paimon.source.enumerator.PaimonBatchSourceSplitEnumerator;
import org.apache.seatunnel.connectors.seatunnel.paimon.source.enumerator.PaimonStreamSourceSplitEnumerator;
import org.apache.seatunnel.connectors.seatunnel.paimon.utils.RowTypeConverter;

import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.table.Table;
import org.apache.paimon.table.source.ReadBuilder;
import org.apache.paimon.types.RowType;

import net.sf.jsqlparser.statement.select.PlainSelect;

import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;

Expand All @@ -58,12 +64,12 @@ public class PaimonSource

private Table paimonTable;

private Predicate predicate;

private int[] projectionIndex;
private JobContext jobContext;

private CatalogTable catalogTable;

protected final ReadBuilder readBuilder;

public PaimonSource(ReadonlyConfig readonlyConfig, PaimonCatalog paimonCatalog) {
this.readonlyConfig = readonlyConfig;
PaimonSourceConfig paimonSourceConfig = new PaimonSourceConfig(readonlyConfig);
Expand All @@ -76,17 +82,22 @@ public PaimonSource(ReadonlyConfig readonlyConfig, PaimonCatalog paimonCatalog)
PlainSelect plainSelect = convertToPlainSelect(filterSql);
RowType paimonRowType = this.paimonTable.rowType();
String[] filedNames = paimonRowType.getFieldNames().toArray(new String[0]);

Predicate predicate = null;
int[] projectionIndex = null;
if (!Objects.isNull(plainSelect)) {
this.projectionIndex = convertSqlSelectToPaimonProjectionIndex(filedNames, plainSelect);
projectionIndex = convertSqlSelectToPaimonProjectionIndex(filedNames, plainSelect);
if (!Objects.isNull(projectionIndex)) {
this.catalogTable =
paimonCatalog.getTableWithProjection(tablePath, projectionIndex);
}
this.predicate =
predicate =
SqlToPaimonPredicateConverter.convertSqlWhereToPaimonPredicate(
paimonRowType, plainSelect);
}
seaTunnelRowType = RowTypeConverter.convert(paimonRowType, projectionIndex);
this.seaTunnelRowType = RowTypeConverter.convert(paimonRowType, projectionIndex);
this.readBuilder =
paimonTable.newReadBuilder().withProjection(projectionIndex).withFilter(predicate);
}

@Override
Expand All @@ -99,31 +110,54 @@ public List<CatalogTable> getProducedCatalogTables() {
return Collections.singletonList(catalogTable);
}

@Override
public void setJobContext(JobContext jobContext) {
this.jobContext = jobContext;
}

@Override
public Boundedness getBoundedness() {
return Boundedness.BOUNDED;
return JobMode.BATCH.equals(jobContext.getJobMode())
? Boundedness.BOUNDED
: Boundedness.UNBOUNDED;
}

@Override
public SourceReader<SeaTunnelRow, PaimonSourceSplit> createReader(
SourceReader.Context readerContext) throws Exception {
return new PaimonSourceReader(
readerContext, paimonTable, seaTunnelRowType, predicate, projectionIndex);
readerContext, paimonTable, seaTunnelRowType, readBuilder.newRead());
}

@Override
public SourceSplitEnumerator<PaimonSourceSplit, PaimonSourceState> createEnumerator(
SourceSplitEnumerator.Context<PaimonSourceSplit> enumeratorContext) throws Exception {
return new PaimonSourceSplitEnumerator(
enumeratorContext, paimonTable, predicate, projectionIndex);
if (getBoundedness() == Boundedness.BOUNDED) {
return new PaimonBatchSourceSplitEnumerator(
enumeratorContext, new LinkedList<>(), null, readBuilder.newScan(), 1);
}
return new PaimonStreamSourceSplitEnumerator(
enumeratorContext, new LinkedList<>(), null, readBuilder.newStreamScan(), 1);
}

@Override
public SourceSplitEnumerator<PaimonSourceSplit, PaimonSourceState> restoreEnumerator(
SourceSplitEnumerator.Context<PaimonSourceSplit> enumeratorContext,
PaimonSourceState checkpointState)
throws Exception {
return new PaimonSourceSplitEnumerator(
enumeratorContext, paimonTable, checkpointState, predicate, projectionIndex);
if (getBoundedness() == Boundedness.BOUNDED) {
return new PaimonBatchSourceSplitEnumerator(
enumeratorContext,
checkpointState.getAssignedSplits(),
checkpointState.getCurrentSnapshotId(),
readBuilder.newScan(),
1);
}
return new PaimonStreamSourceSplitEnumerator(
enumeratorContext,
checkpointState.getAssignedSplits(),
checkpointState.getCurrentSnapshotId(),
readBuilder.newStreamScan(),
1);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,21 @@

package org.apache.seatunnel.connectors.seatunnel.paimon.source;

import org.apache.seatunnel.api.source.Boundedness;
import org.apache.seatunnel.api.source.Collector;
import org.apache.seatunnel.api.source.SourceReader;
import org.apache.seatunnel.api.table.type.RowKind;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
import org.apache.seatunnel.connectors.seatunnel.paimon.utils.RowConverter;
import org.apache.seatunnel.connectors.seatunnel.paimon.utils.RowKindConverter;

import org.apache.paimon.data.InternalRow;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.reader.RecordReader;
import org.apache.paimon.reader.RecordReaderIterator;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.table.Table;
import org.apache.paimon.table.source.TableRead;

import lombok.extern.slf4j.Slf4j;

Expand All @@ -48,20 +51,14 @@ public class PaimonSourceReader implements SourceReader<SeaTunnelRow, PaimonSour
private final Table table;
private final SeaTunnelRowType seaTunnelRowType;
private volatile boolean noMoreSplit;
private final Predicate predicate;
private int[] projection;
private final TableRead tableRead;

public PaimonSourceReader(
Context context,
Table table,
SeaTunnelRowType seaTunnelRowType,
Predicate predicate,
int[] projection) {
Context context, Table table, SeaTunnelRowType seaTunnelRowType, TableRead tableRead) {
this.context = context;
this.table = table;
this.seaTunnelRowType = seaTunnelRowType;
this.predicate = predicate;
this.projection = projection;
this.tableRead = tableRead;
}

@Override
Expand All @@ -81,29 +78,39 @@ public void pollNext(Collector<SeaTunnelRow> output) throws Exception {
if (Objects.nonNull(split)) {
// read logic
try (final RecordReader<InternalRow> reader =
table.newReadBuilder()
.withProjection(projection)
.withFilter(predicate)
.newRead()
.executeFilter()
.createReader(split.getSplit())) {
tableRead.executeFilter().createReader(split.getSplit())) {
final RecordReaderIterator<InternalRow> rowIterator =
new RecordReaderIterator<>(reader);
while (rowIterator.hasNext()) {
final InternalRow row = rowIterator.next();
final SeaTunnelRow seaTunnelRow =
RowConverter.convert(
row, seaTunnelRowType, ((FileStoreTable) table).schema());
if (Boundedness.UNBOUNDED.equals(context.getBoundedness())) {
RowKind rowKind =
RowKindConverter.convertPaimonRowKind2SeatunnelRowkind(
row.getRowKind());
if (rowKind != null) {
seaTunnelRow.setRowKind(rowKind);
}
}
output.collect(seaTunnelRow);
}
}
} else if (noMoreSplit && sourceSplits.isEmpty()) {
}

if (noMoreSplit
&& sourceSplits.isEmpty()
&& Boundedness.BOUNDED.equals(context.getBoundedness())) {
// signal to the source that we have reached the end of the data.
log.info("Closed the bounded flink table store source");
log.info("Closed the bounded table store source");
context.signalNoMoreElement();
} else {
log.warn("Waiting for flink table source split, sleeping 1s");
Thread.sleep(1000L);
context.sendSplitRequest();
if (sourceSplits.isEmpty()) {
log.debug("Waiting for table source split, sleeping 1s");
Thread.sleep(1000L);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,22 @@

import org.apache.paimon.table.source.Split;

import lombok.AllArgsConstructor;
import lombok.Getter;

/** Paimon source split, wrapped the {@link Split} of paimon table. */
@Getter
@AllArgsConstructor
public class PaimonSourceSplit implements SourceSplit {
private static final long serialVersionUID = 1L;

private final Split split;
/** The unique ID of the split. Unique within the scope of this source. */
private final String id;

public PaimonSourceSplit(Split split) {
this.split = split;
}
private final Split split;

@Override
public String splitId() {
return split.toString();
}

public Split getSplit() {
return split;
}
}
Loading

0 comments on commit 4a2e272

Please sign in to comment.