Commit 8b1a37c

Zouxxyy committed Sep 17, 2024
1 parent 2c45ac0

Showing 11 changed files with 221 additions and 52 deletions.
@@ -84,6 +84,10 @@ public DataField newName(String newName) {
         return new DataField(id, newName, type, description);
     }
 
+    public DataField newType(DataType newType) {
+        return new DataField(id, name, newType, description);
+    }
+
     public DataField newDescription(String newDescription) {
         return new DataField(id, name, type, newDescription);
     }
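The new newType accessor rounds out DataField's copy-on-write helpers alongside newName and newDescription. A minimal usage sketch (not part of the commit; it assumes the three-argument DataField constructor and the DataTypes factory in org.apache.paimon.types):

import org.apache.paimon.types.DataField;
import org.apache.paimon.types.DataTypes;

public class NewTypeSketch {
    public static void main(String[] args) {
        // Copy-on-write: the field id and name are preserved, only the type is replaced.
        DataField age = new DataField(0, "age", DataTypes.INT());
        DataField widened = age.newType(DataTypes.BIGINT());
        System.out.println(widened.type()); // BIGINT
    }
}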
paimon-common/src/main/java/org/apache/paimon/utils/Projection.java (64 additions, 26 deletions)
@@ -27,14 +27,13 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static org.apache.paimon.types.DataTypeRoot.ROW;
-
 /**
  * {@link Projection} represents a list of (possibly nested) indexes that can be used to project
  * data types. A row projection includes both reducing the accessible fields and reordering them.
@@ -238,28 +237,18 @@ private static class NestedProjection extends Projection {
 
         @Override
         public RowType project(RowType rowType) {
-            final List<DataField> updatedFields = new ArrayList<>();
-            Set<String> nameDomain = new HashSet<>();
-            int duplicateCount = 0;
-            for (int[] indexPath : this.projection) {
-                DataField field = rowType.getFields().get(indexPath[0]);
-                StringBuilder builder =
-                        new StringBuilder(rowType.getFieldNames().get(indexPath[0]));
-                for (int index = 1; index < indexPath.length; index++) {
-                    Preconditions.checkArgument(
-                            field.type().getTypeRoot() == ROW, "Row data type expected.");
-                    RowType rowtype = ((RowType) field.type());
-                    builder.append("_").append(rowtype.getFieldNames().get(indexPath[index]));
-                    field = rowtype.getFields().get(indexPath[index]);
-                }
-                String path = builder.toString();
-                while (nameDomain.contains(path)) {
-                    path = builder.append("_$").append(duplicateCount++).toString();
+            ProjectedDataTypeBuilder builder = new ProjectedDataTypeBuilder(rowType);
+            for (int[] path : projection) {
+                ProjectedDataTypeBuilder current = builder;
+                for (int i = 0; i < path.length; i++) {
+                    current.projectField(path[i]);
+                    if (i == path.length - 1) {
+                        current.fieldBuilder(path[i]).project();
+                    }
+                    current = current.fieldBuilder(path[i]);
                 }
-                updatedFields.add(field.newName(path));
-                nameDomain.add(path);
             }
-            return new RowType(rowType.isNullable(), updatedFields);
+            return (RowType) builder.build();
         }
 
         @Override
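The rewrite changes the semantics of nested projection: the old code flattened each nested path into a renamed top-level field (e.g. b_d, with _$n suffixes to avoid name clashes), whereas the new code prunes the row type in place and preserves the nested structure. A minimal sketch of the new behavior (not from the commit; it assumes RowType.of with auto-generated field names and the nested Projection.of(int[][]) factory used elsewhere in this class):

// ROW<f0 INT, f1 ROW<f0 INT, f1 INT>>
RowType rowType = RowType.of(DataTypes.INT(), RowType.of(DataTypes.INT(), DataTypes.INT()));
// Project the nested path [1, 1]; the result keeps the nesting: ROW<f1 ROW<f1 INT>>
RowType projected = Projection.of(new int[][] {{1, 1}}).project(rowType);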
@@ -321,10 +310,7 @@ public Projection complement(int fieldsNumber) {
 
         @Override
         public int[] toTopLevelIndexes() {
-            if (isNested()) {
-                throw new IllegalStateException(
-                        "Cannot convert a nested projection to a top level projection");
-            }
+            // TODO: fix its usage
             return Arrays.stream(projection).mapToInt(arr -> arr[0]).toArray();
         }
 
@@ -416,4 +402,56 @@ public int[][] toNestedIndexes() {
             return Arrays.stream(projection).mapToObj(i -> new int[] {i}).toArray(int[][]::new);
         }
     }
+
+    private static class ProjectedDataTypeBuilder {
+        private final DataType dataType;
+        private boolean projected = false;
+        private final Set<Integer> projectedFieldIds = new HashSet<>();
+        private final LinkedList<ProjectedDataTypeBuilder> fieldBuilders = new LinkedList<>();
+
+        public ProjectedDataTypeBuilder(DataType dataType) {
+            this.dataType = dataType;
+            if (dataType instanceof RowType) {
+                for (DataField field : ((RowType) dataType).getFields()) {
+                    fieldBuilders.add(new ProjectedDataTypeBuilder(field.type()));
+                }
+            }
+        }
+
+        public ProjectedDataTypeBuilder project() {
+            this.projected = true;
+            return this;
+        }
+
+        public ProjectedDataTypeBuilder projectField(int fieldId) {
+            if (!projected) {
+                this.projectedFieldIds.add(fieldId);
+            }
+            return this;
+        }
+
+        public ProjectedDataTypeBuilder fieldBuilder(int fieldId) {
+            return fieldBuilders.get(fieldId);
+        }
+
+        public DataType build() {
+            if (projected) {
+                return dataType.copy();
+            }
+
+            if (!fieldBuilders.isEmpty() && !projectedFieldIds.isEmpty()) {
+                List<DataField> oldFields = ((RowType) dataType).getFields();
+                List<DataField> fields = new ArrayList<>(fieldBuilders.size());
+                for (int i = 0; i < fieldBuilders.size(); i++) {
+                    if (projectedFieldIds.contains(i)) {
+                        DataType newType = fieldBuilders.get(i).build();
+                        fields.add(oldFields.get(i).newType(newType));
+                    }
+                }
+                return new RowType(dataType.isNullable(), fields);
+            } else {
+                throw new RuntimeException();
+            }
+        }
+    }
 }
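ProjectedDataTypeBuilder mirrors the type tree: projectField(i) records that child i lies on some projection path, project() marks a whole subtree as fully kept, and build() either copies a fully projected subtree verbatim or rebuilds the RowType from the projected children only, recursing into each of them. For the path {1, 1} above, project() calls builder.projectField(1), then on the child builder for field 1 calls projectField(1) and fieldBuilder(1).project(), so build() keeps only field 1 at each level and copies the leaf type unchanged.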
@@ -113,7 +113,7 @@ default ReadBuilder withFilter(List<Predicate> predicates) {
     /**
      * Apply projection to the reader.
      *
-     * <p>NOTE: Nested row projection is currently not supported.
+     * <p>TODO: update it.
      */
     default ReadBuilder withProjection(int[] projection) {
         if (projection == null) {
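The NOTE is dropped because this commit wires nested row projection through end to end; the javadoc itself is left as a TODO rather than rewritten here.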
@@ -24,7 +24,6 @@
 import org.apache.paimon.types.RowType;
 import org.apache.paimon.utils.Filter;
 import org.apache.paimon.utils.Projection;
-import org.apache.paimon.utils.TypeUtils;
 
 import java.util.Arrays;
 import java.util.Map;
@@ -65,7 +64,7 @@ public RowType readType() {
         if (projection == null) {
             return table.rowType();
         }
-        return TypeUtils.project(table.rowType(), Projection.of(projection).toTopLevelIndexes());
+        return Projection.of(projection).project(table.rowType());
     }
 
     @Override
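readType() now delegates to Projection.project, so the reported read type reflects nested pruning instead of collapsing the projection to top-level indexes via TypeUtils.project.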
@@ -370,12 +370,12 @@ private static List<ColumnDescriptor> getAllColumnDescriptorByType(
     }
 
     public static List<ParquetField> buildFieldsList(
-            List<DataField> childrens, List<String> fieldNames, MessageColumnIO columnIO) {
+            List<DataField> children, List<String> fieldNames, MessageColumnIO columnIO) {
         List<ParquetField> list = new ArrayList<>();
-        for (int i = 0; i < childrens.size(); i++) {
+        for (int i = 0; i < children.size(); i++) {
             list.add(
                     constructField(
-                            childrens.get(i), lookupColumnByName(columnIO, fieldNames.get(i))));
+                            children.get(i), lookupColumnByName(columnIO, fieldNames.get(i))));
         }
         return list;
     }
@@ -50,6 +50,7 @@
 import org.apache.spark.sql.types.UserDefinedType;
 
 import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -183,10 +184,6 @@ public DataType visit(MapType mapType) {
                 mapType.getValueType().isNullable());
     }
 
-    /**
-     * For simplicity, as a temporary solution, we directly convert the non-null attribute to
-     * nullable on the Spark side.
-     */
     @Override
     public DataType visit(RowType rowType) {
         List<StructField> fields = new ArrayList<>(rowType.getFieldCount());
@@ -333,4 +330,30 @@ public org.apache.paimon.types.DataType atomic(DataType atomic) {
                     "Not a supported type: " + atomic.catalogString());
         }
     }
+
+    public static int[][] populateProjection(StructType structType, RowType type) {
+        LinkedList<int[]> projectionList = new LinkedList<>();
+        populateProjection(structType, type, projectionList, new LinkedList<>());
+        return projectionList.toArray(new int[0][]);
+    }
+
+    private static void populateProjection(
+            StructType structType,
+            RowType rowType,
+            LinkedList<int[]> projectionList,
+            LinkedList<Integer> currentPath) {
+        for (StructField field : structType.fields()) {
+            currentPath.add(rowType.getFieldIndex(field.name()));
+            if (field.dataType() instanceof StructType) {
+                populateProjection(
+                        (StructType) field.dataType(),
+                        (RowType) rowType.getField(field.name()).type(),
+                        projectionList,
+                        currentPath);
+            } else {
+                projectionList.add(currentPath.stream().mapToInt(Integer::intValue).toArray());
+            }
+            currentPath.removeLast();
+        }
+    }
 }
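A minimal sketch of what populateProjection computes (not from the commit; the builder-style construction of the Paimon and Spark types is illustrative):

import org.apache.paimon.spark.SparkTypeUtils;
import org.apache.paimon.types.DataTypes;
import org.apache.paimon.types.RowType;
import org.apache.spark.sql.types.StructType;

public class PopulateProjectionSketch {
    public static void main(String[] args) {
        // Paimon table type: ROW<a INT, b ROW<c INT, d INT>>
        RowType tableType =
                RowType.builder()
                        .field("a", DataTypes.INT())
                        .field(
                                "b",
                                RowType.builder()
                                        .field("c", DataTypes.INT())
                                        .field("d", DataTypes.INT())
                                        .build())
                        .build();
        // Spark's pruned read schema keeps only b.d.
        StructType pruned =
                new StructType()
                        .add(
                                "b",
                                new StructType()
                                        .add("d", org.apache.spark.sql.types.DataTypes.IntegerType));
        // Expected result: {{1, 1}} since "b" is index 1 in the table type and "d" is index 1 inside "b".
        int[][] projection = SparkTypeUtils.populateProjection(pruned, tableType);
        System.out.println(java.util.Arrays.deepToString(projection));
    }
}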
@@ -55,13 +55,12 @@ abstract class PaimonBaseScan(
   private lazy val tableSchema = SparkTypeUtils.fromPaimonRowType(tableRowType)
 
   private[paimon] val (requiredTableFields, metadataFields) = {
-    val nameToField = tableSchema.map(field => (field.name, field)).toMap
-    val _tableFields = requiredSchema.flatMap(field => nameToField.get(field.name))
-    val _metadataFields =
-      requiredSchema
-        .filterNot(field => tableSchema.fieldNames.contains(field.name))
-        .filter(field => PaimonMetadataColumn.SUPPORTED_METADATA_COLUMNS.contains(field.name))
-    (_tableFields, _metadataFields)
+    assert(
+      requiredSchema.fields.forall(
+        field =>
+          tableRowType.containsField(field.name) ||
+            PaimonMetadataColumn.SUPPORTED_METADATA_COLUMNS.contains(field.name)))
+    requiredSchema.fields.partition(field => tableRowType.containsField(field.name))
   }
 
   protected var runtimeFilters: Array[Filter] = Array.empty
@@ -82,9 +81,8 @@
   lazy val readBuilder: ReadBuilder = {
     val _readBuilder = table.newReadBuilder()
 
-    val projection =
-      requiredTableFields.map(field => tableSchema.fieldNames.indexOf(field.name)).toArray
-    _readBuilder.withProjection(projection)
+    _readBuilder.withProjection(
+      SparkTypeUtils.populateProjection(StructType(requiredTableFields), tableRowType))
     if (filters.nonEmpty) {
       val pushedPredicate = PredicateBuilder.and(filters: _*)
       _readBuilder.withFilter(pushedPredicate)
@@ -114,7 +112,7 @@
   }
 
   override def readSchema(): StructType = {
-    StructType(requiredTableFields ++ metadataFields)
+    requiredSchema
   }
 
   override def toBatch: Batch = {
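Net effect on the Spark read path: PaimonBaseScan no longer projects by top-level field positions; it derives exact nested paths from the pruned Spark schema via SparkTypeUtils.populateProjection and pushes them into the ReadBuilder, so only the requested leaf columns are read even inside structs, and readSchema() can return the pruned schema directly.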
@@ -34,15 +34,16 @@ abstract class PaimonBaseScanBuilder(table: Table)
   with SupportsPushDownRequiredColumns
   with Logging {
 
-  protected var requiredSchema: StructType = SparkTypeUtils.fromPaimonRowType(table.rowType())
+  private var prunedSchema: Option[StructType] = None
 
-  protected var pushed: Array[(Filter, Predicate)] = Array.empty
+  private var pushed: Array[(Filter, Predicate)] = Array.empty
 
   protected var reservedFilters: Array[Filter] = Array.empty
 
   protected var pushDownLimit: Option[Int] = None
 
   override def build(): Scan = {
+    val requiredSchema = prunedSchema.getOrElse(SparkTypeUtils.fromPaimonRowType(table.rowType))
     PaimonScan(table, requiredSchema, pushed.map(_._2), reservedFilters, pushDownLimit)
   }
 
@@ -87,6 +88,6 @@ abstract class PaimonBaseScanBuilder(table: Table)
   }
 
   override def pruneColumns(requiredSchema: StructType): Unit = {
-    this.requiredSchema = requiredSchema
+    this.prunedSchema = Some(requiredSchema)
   }
 }
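Making the pruned schema an Option distinguishes "Spark never called pruneColumns" from an explicitly pruned schema: build() falls back to the full table schema only in the former case, instead of pre-populating a mutable requiredSchema with the full row type.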
@@ -45,7 +45,7 @@
 import scala.collection.JavaConverters;
 
 import static org.apache.paimon.data.BinaryString.fromString;
-import static org.apache.paimon.spark.SparkTypeTest.ALL_TYPES;
+import static org.apache.paimon.spark.SparkTypeUtilsTest.ALL_TYPES;
 import static org.assertj.core.api.Assertions.assertThat;
 
 /** Test for {@link SparkInternalRow}. */