Use of org.apache.spark.sql.connector.distributions.Distribution in the apache/iceberg project.
From the class SparkDistributionAndOrderingUtil, method buildPositionMergeDistribution.
private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) {
  switch (distributionMode) {
    case NONE:
      return Distributions.unspecified();

    case HASH:
      if (table.spec().isUnpartitioned()) {
        Expression[] clustering = new Expression[] { SPEC_ID, PARTITION, FILE_PATH };
        return Distributions.clustered(clustering);
      } else {
        Distribution dataDistribution = buildRequiredDistribution(table, distributionMode);
        Expression[] dataClustering = ((ClusteredDistribution) dataDistribution).clustering();
        Expression[] deleteClustering = new Expression[] { SPEC_ID, PARTITION };
        Expression[] clustering = ObjectArrays.concat(deleteClustering, dataClustering, Expression.class);
        return Distributions.clustered(clustering);
      }

    case RANGE:
      Distribution dataDistribution = buildRequiredDistribution(table, distributionMode);
      SortOrder[] dataOrdering = ((OrderedDistribution) dataDistribution).ordering();
      SortOrder[] deleteOrdering = new SortOrder[] { SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER };
      SortOrder[] ordering = ObjectArrays.concat(deleteOrdering, dataOrdering, SortOrder.class);
      return Distributions.ordered(ordering);

    default:
      throw new IllegalArgumentException("Unexpected distribution mode: " + distributionMode);
  }
}
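The SPEC_ID, PARTITION, and FILE_PATH references and the matching *_ORDER constants are declared elsewhere in SparkDistributionAndOrderingUtil. Below is a minimal sketch of how they could be defined, assuming Iceberg's reserved metadata column names _spec_id, _partition, _file, and _pos; the class name and exact declarations are illustrative, not copied from the project.

import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.NamedReference;
import org.apache.spark.sql.connector.expressions.SortDirection;
import org.apache.spark.sql.connector.expressions.SortOrder;

class PositionDeleteMetadataRefs {
  // references to the row-level metadata columns used when distributing position deletes
  // (column names are assumed to be Iceberg's reserved _spec_id, _partition, _file, _pos)
  static final NamedReference SPEC_ID = Expressions.column("_spec_id");
  static final NamedReference PARTITION = Expressions.column("_partition");
  static final NamedReference FILE_PATH = Expressions.column("_file");
  static final NamedReference ROW_POSITION = Expressions.column("_pos");

  // ascending sort orders over those references, used when deletes must also be ordered
  static final SortOrder SPEC_ID_ORDER = Expressions.sort(SPEC_ID, SortDirection.ASCENDING);
  static final SortOrder PARTITION_ORDER = Expressions.sort(PARTITION, SortDirection.ASCENDING);
  static final SortOrder FILE_PATH_ORDER = Expressions.sort(FILE_PATH, SortDirection.ASCENDING);
  static final SortOrder ROW_POSITION_ORDER = Expressions.sort(ROW_POSITION, SortDirection.ASCENDING);
}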
Use of org.apache.spark.sql.connector.distributions.Distribution in the apache/iceberg project.
From the class TestSparkDistributionAndOrderingUtil, method testRangeCopyOnWriteDeleteUnpartitionedSortedTable.
@Test
public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() {
  sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);

  Table table = validationCatalog.loadTable(tableIdent);

  table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit();

  table.replaceSortOrder().asc("id").asc("data").commit();

  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING)
  };

  Distribution expectedDistribution = Distributions.ordered(expectedOrdering);

  checkCopyOnWriteDistributionAndOrdering(table, DELETE, expectedDistribution, expectedOrdering);
}
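The checkCopyOnWriteDistributionAndOrdering helper itself is not part of these snippets. A plausible sketch of what it asserts is shown below; the buildCopyOnWriteDistribution and buildCopyOnWriteOrdering entry points on SparkDistributionAndOrderingUtil and the copyOnWriteDistributionMode helper are assumptions used only for illustration.

// Sketch only: the build* method names and copyOnWriteDistributionMode are assumed, not the
// project's confirmed API.
private void checkCopyOnWriteDistributionAndOrdering(
    Table table, Command command, Distribution expectedDistribution, SortOrder[] expectedOrdering) {
  // resolve the distribution mode configured for this row-level command (assumed helper)
  DistributionMode mode = copyOnWriteDistributionMode(table, command);

  Distribution distribution =
      SparkDistributionAndOrderingUtil.buildCopyOnWriteDistribution(table, command, mode);
  Assert.assertEquals("Distribution must match", expectedDistribution, distribution);

  SortOrder[] ordering =
      SparkDistributionAndOrderingUtil.buildCopyOnWriteOrdering(table, command, distribution);
  Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering);
}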
Use of org.apache.spark.sql.connector.distributions.Distribution in the apache/iceberg project.
From the class TestSparkDistributionAndOrderingUtil, method testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable.
@Test
public void testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable() {
  sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);

  Table table = validationCatalog.loadTable(tableIdent);

  table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit();

  Distribution expectedDistribution = Distributions.ordered(FILE_POSITION_ORDERING);

  checkCopyOnWriteDistributionAndOrdering(table, DELETE, expectedDistribution, FILE_POSITION_ORDERING);
}
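FILE_POSITION_ORDERING is likewise defined outside these snippets. A minimal sketch of what such an ordering could look like, assuming it sorts by the _file and _pos metadata columns; the names are shown for illustration only.

// Illustrative only: ascending ordering over the file path and row position metadata columns.
SortOrder[] FILE_POSITION_ORDERING = new SortOrder[] {
    Expressions.sort(Expressions.column("_file"), SortDirection.ASCENDING),
    Expressions.sort(Expressions.column("_pos"), SortDirection.ASCENDING)
};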
Use of org.apache.spark.sql.connector.distributions.Distribution in the apache/iceberg project.
From the class TestSparkDistributionAndOrderingUtil, method testHashWritePartitionedSortedTable.
@Test
public void testHashWritePartitionedSortedTable() {
  sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " +
      "USING iceberg " +
      "PARTITIONED BY (date, bucket(8, data))", tableName);

  Table table = validationCatalog.loadTable(tableIdent);

  table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();

  table.replaceSortOrder().asc("id").commit();

  Expression[] expectedClustering = new Expression[] {
      Expressions.identity("date"),
      Expressions.bucket(8, "data")
  };

  Distribution expectedDistribution = Distributions.clustered(expectedClustering);

  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
  };

  checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering);
}
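checkWriteDistributionAndOrdering is another test helper whose body is not shown here. A plausible sketch of the check follows, reusing buildRequiredDistribution, which the first snippet above references; buildRequiredOrdering and the property lookup are assumptions made for illustration.

// Sketch only: buildRequiredOrdering is an assumed method name; buildRequiredDistribution is
// the entry point referenced by buildPositionMergeDistribution above.
private void checkWriteDistributionAndOrdering(
    Table table, Distribution expectedDistribution, SortOrder[] expectedOrdering) {
  String modeName =
      table.properties().getOrDefault(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_NONE);
  DistributionMode mode = DistributionMode.fromName(modeName);

  Distribution distribution = SparkDistributionAndOrderingUtil.buildRequiredDistribution(table, mode);
  Assert.assertEquals("Distribution must match", expectedDistribution, distribution);

  SortOrder[] ordering = SparkDistributionAndOrderingUtil.buildRequiredOrdering(table, distribution);
  Assert.assertArrayEquals("Ordering must match", expectedOrdering, ordering);
}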
Use of org.apache.spark.sql.connector.distributions.Distribution in the apache/iceberg project.
From the class TestSparkDistributionAndOrderingUtil, method testHashCopyOnWriteMergePartitionedSortedTable.
@Test
public void testHashCopyOnWriteMergePartitionedSortedTable() {
  sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " +
      "USING iceberg " +
      "PARTITIONED BY (date, bucket(8, data))", tableName);

  Table table = validationCatalog.loadTable(tableIdent);

  table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();

  table.replaceSortOrder().asc("id").commit();

  Expression[] expectedClustering = new Expression[] {
      Expressions.identity("date"),
      Expressions.bucket(8, "data")
  };

  Distribution expectedDistribution = Distributions.clustered(expectedClustering);

  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
  };

  checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
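The property constants set across these tests (WRITE_DISTRIBUTION_MODE, DELETE_DISTRIBUTION_MODE, MERGE_DISTRIBUTION_MODE) come from org.apache.iceberg.TableProperties. As a rough illustration, the same configuration can be written with raw string keys, which are assumed here to be Iceberg's documented write distribution properties.

// Illustrative mapping of the constants used above to assumed property keys and values.
table.updateProperties()
    .set("write.distribution-mode", "hash")          // WRITE_DISTRIBUTION_MODE / WRITE_DISTRIBUTION_MODE_HASH
    .set("write.delete.distribution-mode", "range")  // DELETE_DISTRIBUTION_MODE / WRITE_DISTRIBUTION_MODE_RANGE
    .set("write.merge.distribution-mode", "hash")    // MERGE_DISTRIBUTION_MODE / WRITE_DISTRIBUTION_MODE_HASH
    .commit();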