
[core][spark][WIP] Support nested row projection pushdown #4201

Closed · wants to merge 4 commits
DataField.java

@@ -84,6 +84,10 @@ public DataField newName(String newName) {
         return new DataField(id, newName, type, description);
     }
 
+    public DataField newType(DataType newType) {
+        return new DataField(id, name, newType, description);
+    }
+
     public DataField newDescription(String newDescription) {
         return new DataField(id, name, type, newDescription);
     }
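The new `newType` copy-constructor rounds out `DataField`'s `newName`/`newDescription` family; the projection builder in `Projection.java` below uses it to swap a nested field's type for its pruned counterpart while keeping the field id, name, and description. A minimal sketch of the intended use (the id, names, and schema are illustrative, not from this PR):

```java
import org.apache.paimon.types.DataField;
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.DataTypes;
import org.apache.paimon.types.RowType;

// A nested field: user row<name string, age int> (id and description made up).
DataField user =
        new DataField(
                3,
                "user",
                RowType.of(
                        new DataType[] {DataTypes.STRING(), DataTypes.INT()},
                        new String[] {"name", "age"}),
                null);

// Same id, name, and description; only the type is replaced with the pruned row<age int>.
DataField pruned =
        user.newType(RowType.of(new DataType[] {DataTypes.INT()}, new String[] {"age"}));
```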
102 changes: 73 additions & 29 deletions paimon-common/src/main/java/org/apache/paimon/utils/Projection.java
@@ -26,15 +26,14 @@
 import java.lang.reflect.Array;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashSet;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.ListIterator;
-import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static org.apache.paimon.types.DataTypeRoot.ROW;
-
 /**
  * {@link Projection} represents a list of (possibly nested) indexes that can be used to project
  * data types. A row projection includes both reducing the accessible fields and reordering them.
@@ -238,28 +237,18 @@ private static class NestedProjection extends Projection {
 
         @Override
         public RowType project(RowType rowType) {
-            final List<DataField> updatedFields = new ArrayList<>();
-            Set<String> nameDomain = new HashSet<>();
-            int duplicateCount = 0;
-            for (int[] indexPath : this.projection) {
-                DataField field = rowType.getFields().get(indexPath[0]);
-                StringBuilder builder =
-                        new StringBuilder(rowType.getFieldNames().get(indexPath[0]));
-                for (int index = 1; index < indexPath.length; index++) {
-                    Preconditions.checkArgument(
-                            field.type().getTypeRoot() == ROW, "Row data type expected.");
-                    RowType rowtype = ((RowType) field.type());
-                    builder.append("_").append(rowtype.getFieldNames().get(indexPath[index]));
-                    field = rowtype.getFields().get(indexPath[index]);
-                }
-                String path = builder.toString();
-                while (nameDomain.contains(path)) {
-                    path = builder.append("_$").append(duplicateCount++).toString();
-                }
-                updatedFields.add(field.newName(path));
-                nameDomain.add(path);
+            ProjectedDataTypeBuilder builder = new ProjectedDataTypeBuilder(rowType);
+            for (int[] path : projection) {
+                ProjectedDataTypeBuilder current = builder;
+                for (int i = 0; i < path.length; i++) {
+                    current.projectField(path[i]);
+                    if (i == path.length - 1) {
+                        current.fieldBuilder(path[i]).project();
+                    }
+                    current = current.fieldBuilder(path[i]);
+                }
             }
-            return new RowType(rowType.isNullable(), updatedFields);
+            return (RowType) builder.build();
         }
 
         @Override
@@ -321,11 +310,11 @@ public Projection complement(int fieldsNumber) {
 
         @Override
         public int[] toTopLevelIndexes() {
-            if (isNested()) {
-                throw new IllegalStateException(
-                        "Cannot convert a nested projection to a top level projection");
-            }
-            return Arrays.stream(projection).mapToInt(arr -> arr[0]).toArray();
+            return Arrays.stream(projection)
+                    .map(arr -> arr[0])
+                    .distinct()
+                    .mapToInt(Integer::intValue)
+                    .toArray();
         }
 
         @Override
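Note the behavioral change here: instead of throwing for nested projections, `toTopLevelIndexes` now collapses every path to its outermost index and de-duplicates, so callers that can only prune whole top-level columns still get a usable answer. A small sketch of the new semantics (illustrative paths, not a test from this PR):

```java
// f0.a and f0.b share top-level field 0; f2 is already top level.
int[] topLevel = Projection.of(new int[][] {{0, 1}, {0, 2}, {2}}).toTopLevelIndexes();
// topLevel == [0, 2]: duplicates from sibling nested paths are collapsed.
```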
@@ -416,4 +405,59 @@ public int[][] toNestedIndexes() {
             return Arrays.stream(projection).mapToObj(i -> new int[] {i}).toArray(int[][]::new);
         }
     }
+
+    private static class ProjectedDataTypeBuilder {
+        private final DataType dataType;
+        private boolean projected = false;
+        private final LinkedHashSet<Integer> projectedFieldIds = new LinkedHashSet<>();
+        private final LinkedList<ProjectedDataTypeBuilder> fieldBuilders = new LinkedList<>();
+
+        public ProjectedDataTypeBuilder(DataType dataType) {
+            this.dataType = dataType;
+            if (dataType instanceof RowType) {
+                for (DataField field : ((RowType) dataType).getFields()) {
+                    fieldBuilders.add(new ProjectedDataTypeBuilder(field.type()));
+                }
+            }
+        }
+
+        public ProjectedDataTypeBuilder project() {
+            this.projected = true;
+            return this;
+        }
+
+        public ProjectedDataTypeBuilder projectField(int fieldId) {
+            if (!projected) {
+                this.projectedFieldIds.add(fieldId);
+            }
+            return this;
+        }
+
+        public ProjectedDataTypeBuilder fieldBuilder(int fieldId) {
+            return fieldBuilders.get(fieldId);
+        }
+
+        public DataType build() {
+            if (projected) {
+                return dataType.copy();
+            }
+
+            if (fieldBuilders.isEmpty()) {
+                // unreachable: a non-row type is only built when fully projected
+                throw new RuntimeException();
+            }
+
+            if (projectedFieldIds.isEmpty()) {
+                return new RowType(dataType.isNullable(), Collections.emptyList());
+            } else {
+                List<DataField> oldFields = ((RowType) dataType).getFields();
+                List<DataField> fields = new ArrayList<>(fieldBuilders.size());
+                for (Integer i : projectedFieldIds) {
+                    DataType newType = fieldBuilders.get(i).build();
+                    fields.add(oldFields.get(i).newType(newType));
+                }
+                return new RowType(dataType.isNullable(), fields);
+            }
+        }
+    }
 }
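Taken together, `NestedProjection.project` no longer flattens nested selections into renamed top-level fields (the old `parent_child` / `_$n` de-duplication scheme); it now returns a row type that keeps the original nesting, with unselected fields dropped. A hedged before/after sketch (schema is illustrative):

```java
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.DataTypes;
import org.apache.paimon.types.RowType;
import org.apache.paimon.utils.Projection;

// row<f0 row<a int, b string>, f1 bigint>
RowType rowType =
        RowType.of(
                new DataType[] {
                    RowType.of(
                            new DataType[] {DataTypes.INT(), DataTypes.STRING()},
                            new String[] {"a", "b"}),
                    DataTypes.BIGINT()
                },
                new String[] {"f0", "f1"});

// Select only the nested path f0.b.
RowType projected = Projection.of(new int[][] {{0, 1}}).project(rowType);
// Before this PR: ROW<f0_b STRING>        (flattened and renamed)
// After this PR:  ROW<f0 ROW<b STRING>>   (nesting preserved, f0.a pruned)
```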
ReadBuilder.java

@@ -113,7 +113,7 @@ default ReadBuilder withFilter(List<Predicate> predicates) {
     /**
      * Apply projection to the reader.
      *
-     * <p>NOTE: Nested row projection is currently not supported.
+     * <p>todo: update it.
      */
     default ReadBuilder withProjection(int[] projection) {
         if (projection == null) {
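The replaced NOTE was the user-facing statement that nested projection is unsupported; the WIP todo stands in for updated documentation. Under the new behavior, the nested-index overload that the Spark side calls below selects individual leaves inside row columns, roughly like this (a sketch; the schema and indexes are illustrative):

```java
// Top-level projection, as before: read only columns 0 and 2.
readBuilder.withProjection(new int[] {0, 2});

// Nested projection, now pushed all the way down: read only f1.b,
// leaving f1's other fields out of the scan entirely.
readBuilder.withProjection(new int[][] {{1, 1}});
```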
@@ -24,7 +24,6 @@
 import org.apache.paimon.types.RowType;
 import org.apache.paimon.utils.Filter;
 import org.apache.paimon.utils.Projection;
-import org.apache.paimon.utils.TypeUtils;
 
 import java.util.Arrays;
 import java.util.Map;

@@ -65,7 +64,7 @@ public RowType readType() {
         if (projection == null) {
             return table.rowType();
         }
-        return TypeUtils.project(table.rowType(), Projection.of(projection).toTopLevelIndexes());
+        return Projection.of(projection).project(table.rowType());
     }
 
     @Override
@@ -370,12 +370,12 @@ private static List<ColumnDescriptor> getAllColumnDescriptorByType(
     }
 
     public static List<ParquetField> buildFieldsList(
-            List<DataField> childrens, List<String> fieldNames, MessageColumnIO columnIO) {
+            List<DataField> children, List<String> fieldNames, MessageColumnIO columnIO) {
         List<ParquetField> list = new ArrayList<>();
-        for (int i = 0; i < childrens.size(); i++) {
+        for (int i = 0; i < children.size(); i++) {
             list.add(
                     constructField(
-                            childrens.get(i), lookupColumnByName(columnIO, fieldNames.get(i))));
+                            children.get(i), lookupColumnByName(columnIO, fieldNames.get(i))));
         }
         return list;
     }
SparkTypeUtils.java

@@ -50,6 +50,7 @@
 import org.apache.spark.sql.types.UserDefinedType;
 
 import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -183,10 +184,6 @@ public DataType visit(MapType mapType) {
                     mapType.getValueType().isNullable());
         }
 
-        /**
-         * For simplicity, as a temporary solution, we directly convert the non-null attribute to
-         * nullable on the Spark side.
-         */
         @Override
         public DataType visit(RowType rowType) {
             List<StructField> fields = new ArrayList<>(rowType.getFieldCount());

@@ -333,4 +330,30 @@ public org.apache.paimon.types.DataType atomic(DataType atomic) {
                     "Not a supported type: " + atomic.catalogString());
         }
     }
+
+    public static int[][] populateProjection(StructType structType, RowType type) {
+        LinkedList<int[]> projectionList = new LinkedList<>();
+        populateProjection(structType, type, projectionList, new LinkedList<>());
+        return projectionList.toArray(new int[0][]);
+    }
+
+    private static void populateProjection(
+            StructType structType,
+            RowType rowType,
+            LinkedList<int[]> projectionList,
+            LinkedList<Integer> currentPath) {
+        for (StructField field : structType.fields()) {
+            currentPath.add(rowType.getFieldIndex(field.name()));
+            if (field.dataType() instanceof StructType) {
+                populateProjection(
+                        (StructType) field.dataType(),
+                        (RowType) rowType.getField(field.name()).type(),
+                        projectionList,
+                        currentPath);
+            } else {
+                projectionList.add(currentPath.stream().mapToInt(Integer::intValue).toArray());
+            }
+            currentPath.removeLast();
+        }
+    }
 }
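`populateProjection` walks Spark's pruned `StructType` in lockstep with the Paimon `RowType`, recursing through struct fields and emitting one index path per non-struct leaf. For example (a sketch with an illustrative schema; the unqualified `DataTypes` here is Spark's):

```java
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.RowType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Paimon table schema: row<id int, user row<name string, age int>>
RowType tableType =
        RowType.of(
                new DataType[] {
                    org.apache.paimon.types.DataTypes.INT(),
                    RowType.of(
                            new DataType[] {
                                org.apache.paimon.types.DataTypes.STRING(),
                                org.apache.paimon.types.DataTypes.INT()
                            },
                            new String[] {"name", "age"})
                },
                new String[] {"id", "user"});

// Spark's read schema after column pruning: struct<user: struct<age: int>>
StructType required =
        new StructType().add("user", new StructType().add("age", DataTypes.IntegerType));

int[][] projection = SparkTypeUtils.populateProjection(required, tableType);
// projection == [[1, 1]]: "user" is top-level field 1, "age" is field 1 inside it.
```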
PaimonBaseScan.scala

@@ -55,13 +55,12 @@ abstract class PaimonBaseScan(
   private lazy val tableSchema = SparkTypeUtils.fromPaimonRowType(tableRowType)
 
   private[paimon] val (requiredTableFields, metadataFields) = {
-    val nameToField = tableSchema.map(field => (field.name, field)).toMap
-    val _tableFields = requiredSchema.flatMap(field => nameToField.get(field.name))
-    val _metadataFields =
-      requiredSchema
-        .filterNot(field => tableSchema.fieldNames.contains(field.name))
-        .filter(field => PaimonMetadataColumn.SUPPORTED_METADATA_COLUMNS.contains(field.name))
-    (_tableFields, _metadataFields)
+    assert(
+      requiredSchema.fields.forall(
+        field =>
+          tableRowType.containsField(field.name) ||
+            PaimonMetadataColumn.SUPPORTED_METADATA_COLUMNS.contains(field.name)))
+    requiredSchema.fields.partition(field => tableRowType.containsField(field.name))
   }
 
   protected var runtimeFilters: Array[Filter] = Array.empty

@@ -82,9 +81,8 @@
   lazy val readBuilder: ReadBuilder = {
     val _readBuilder = table.newReadBuilder()
 
-    val projection =
-      requiredTableFields.map(field => tableSchema.fieldNames.indexOf(field.name)).toArray
-    _readBuilder.withProjection(projection)
+    _readBuilder.withProjection(
+      SparkTypeUtils.populateProjection(StructType(requiredTableFields), tableRowType))
     if (filters.nonEmpty) {
       val pushedPredicate = PredicateBuilder.and(filters: _*)
       _readBuilder.withFilter(pushedPredicate)

@@ -114,7 +112,7 @@
   }
 
   override def readSchema(): StructType = {
-    StructType(requiredTableFields ++ metadataFields)
+    requiredSchema
   }
 
   override def toBatch: Batch = {
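The net effect on the Spark side: the scan now hands `ReadBuilder` the full nested paths derived from Spark's pruned schema, instead of widening every nested reference to its whole top-level column. Roughly, for a query such as `SELECT user.age FROM t` (a Java-flavored sketch of the resulting call; variable names mirror the Scala above and the schema reuses the illustrative example):

```java
// Old pushdown: [1] — the entire "user" struct was read from storage.
// New pushdown: [[1, 1]] — only user.age is read.
int[][] nested = SparkTypeUtils.populateProjection(requiredSchema, tableRowType);
ReadBuilder readBuilder = table.newReadBuilder().withProjection(nested);
```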
PaimonBaseScanBuilder.scala

@@ -34,15 +34,16 @@ abstract class PaimonBaseScanBuilder(table: Table)
   with SupportsPushDownRequiredColumns
   with Logging {
 
-  protected var requiredSchema: StructType = SparkTypeUtils.fromPaimonRowType(table.rowType())
+  private var prunedSchema: Option[StructType] = None
 
-  protected var pushed: Array[(Filter, Predicate)] = Array.empty
+  private var pushed: Array[(Filter, Predicate)] = Array.empty
 
   protected var reservedFilters: Array[Filter] = Array.empty
 
   protected var pushDownLimit: Option[Int] = None
 
   override def build(): Scan = {
+    val requiredSchema = prunedSchema.getOrElse(SparkTypeUtils.fromPaimonRowType(table.rowType))
     PaimonScan(table, requiredSchema, pushed.map(_._2), reservedFilters, pushDownLimit)
   }

@@ -87,6 +88,6 @@
   }
 
   override def pruneColumns(requiredSchema: StructType): Unit = {
-    this.requiredSchema = requiredSchema
+    this.prunedSchema = Some(requiredSchema)
   }
 }
SparkInternalRowTest.java

@@ -45,7 +45,7 @@
 import scala.collection.JavaConverters;
 
 import static org.apache.paimon.data.BinaryString.fromString;
-import static org.apache.paimon.spark.SparkTypeTest.ALL_TYPES;
+import static org.apache.paimon.spark.SparkTypeUtilsTest.ALL_TYPES;
 import static org.assertj.core.api.Assertions.assertThat;
 
 /** Test for {@link SparkInternalRow}. */