diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java index 24dc01c47bf6..514afff296bb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java +++ b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java @@ -34,6 +34,7 @@ import org.apache.paimon.stats.SimpleStats; import org.apache.paimon.table.SpecialFields; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.Pair; import org.apache.paimon.utils.Range; @@ -257,14 +258,15 @@ static EvolutionStats evolutionStats( continue; } int targetFieldId = allFields[j]; + DataType targetType = schema.fields().get(j).type(); for (int fieldId : fieldIds) { if (targetFieldId == fieldId) { for (int k = 0; k < fieldIdsWithStats.length; k++) { if (fieldId == fieldIdsWithStats[k]) { - // TODO: If type not match (e.g. int -> string), we need to skip - // this, set rowOffsets[j] = -1 always. (may -2, after all, set it - // back to -1) Because schema evolution may happen to change int to - // string or something like that. + DataType fileType = dataFileSchemaWithStats.fields().get(k).type(); + if (!fileType.equalsIgnoreFieldId(targetType)) { + continue loop1; + } rowOffsets[j] = i; fieldOffsets[j] = k; continue loop1; diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java index d23f74993179..803e4ac2559f 100644 --- a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java @@ -260,6 +260,56 @@ public void testEvolutionStatsWithWriteColsNotEqualToValueStatsCols() { assertThat(nullCounts.isNullAt(2)).isTrue(); } + @Test + public void testEvolutionStatsSkipsStatsAfterColumnTypeChange() { + Schema baseSchema = createSchema("f0", "f1"); + TableSchema baseTableSchema = TableSchema.create(0L, baseSchema); + schemas.put(0L, baseTableSchema); + + Schema evolvedSchema = + Schema.newBuilder() + .column("f0", DataTypes.STRING()) + .column("f1", DataTypes.STRING()) + .build(); + TableSchema evolvedTableSchema = TableSchema.create(1L, evolvedSchema); + schemas.put(1L, evolvedTableSchema); + + ManifestEntry oldTypeEntry = + createManifestEntry( + 0L, + createSimpleStats( + GenericRow.of(10, BinaryString.fromString("a")), + GenericRow.of(99, BinaryString.fromString("z")), + createBinaryArray(new int[] {0, 0}), + new int[] {0, 1})); + + BinaryRow newTypeMin = new BinaryRow(2); + BinaryRowWriter newTypeMinWriter = new BinaryRowWriter(newTypeMin); + newTypeMinWriter.writeString(0, BinaryString.fromString("apple")); + newTypeMinWriter.writeString(1, BinaryString.fromString("banana")); + newTypeMinWriter.complete(); + BinaryRow newTypeMax = new BinaryRow(2); + BinaryRowWriter newTypeMaxWriter = new BinaryRowWriter(newTypeMax); + newTypeMaxWriter.writeString(0, BinaryString.fromString("yam")); + newTypeMaxWriter.writeString(1, BinaryString.fromString("zebra")); + newTypeMaxWriter.complete(); + SimpleStats newTypeStats = + new SimpleStats(newTypeMin, newTypeMax, createBinaryArray(new int[] {0, 0})); + ManifestEntry newTypeEntry = createManifestEntry(1L, newTypeStats); + + EvolutionStats result = + DataEvolutionFileStoreScan.evolutionStats( + evolvedTableSchema, + scanTableSchema, + Arrays.asList(oldTypeEntry, newTypeEntry)); + + DataEvolutionRow minRow = (DataEvolutionRow) result.minValues(); + DataEvolutionRow maxRow = (DataEvolutionRow) result.maxValues(); + + assertThat(minRow.getString(0).toString()).isEqualTo("apple"); + assertThat(maxRow.getString(0).toString()).isEqualTo("yam"); + } + @Test public void testIntersectsRowRanges() { List rowRanges =