Search in sources :

Example 1 with Distribution

Use of org.apache.spark.sql.connector.distributions.Distribution in the Apache Iceberg project.

From the class SparkDistributionAndOrderingUtil, the method buildPositionMergeDistribution:

/**
 * Computes the required write distribution for position-delete based MERGE operations.
 *
 * <p>For HASH and RANGE modes, the delete-file metadata fields (spec id, partition, and — for
 * unpartitioned or range cases — file path) are placed ahead of the table's regular data
 * clustering/ordering so that position deletes co-locate with the rows they reference.
 *
 * @param table the Iceberg table being written
 * @param distributionMode the configured distribution mode (NONE, HASH, or RANGE)
 * @return the Spark distribution required for the merge write
 * @throws IllegalArgumentException if the distribution mode is not recognized
 */
private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) {
    switch (distributionMode) {
        case NONE:
            return Distributions.unspecified();

        case HASH:
            if (table.spec().isUnpartitioned()) {
                // Unpartitioned tables cluster purely by delete-file metadata.
                Expression[] unpartitionedClustering = new Expression[] { SPEC_ID, PARTITION, FILE_PATH };
                return Distributions.clustered(unpartitionedClustering);
            } else {
                // Partitioned tables prepend delete metadata to the table's data clustering.
                ClusteredDistribution dataDistribution =
                        (ClusteredDistribution) buildRequiredDistribution(table, distributionMode);
                Expression[] deleteClustering = new Expression[] { SPEC_ID, PARTITION };
                Expression[] combinedClustering =
                        ObjectArrays.concat(deleteClustering, dataDistribution.clustering(), Expression.class);
                return Distributions.clustered(combinedClustering);
            }

        case RANGE:
            // Prepend the delete-file sort keys to the table's data ordering.
            OrderedDistribution orderedDataDistribution =
                    (OrderedDistribution) buildRequiredDistribution(table, distributionMode);
            SortOrder[] deleteOrdering = new SortOrder[] { SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER };
            SortOrder[] combinedOrdering =
                    ObjectArrays.concat(deleteOrdering, orderedDataDistribution.ordering(), SortOrder.class);
            return Distributions.ordered(combinedOrdering);

        default:
            throw new IllegalArgumentException("Unexpected distribution mode: " + distributionMode);
    }
}
Also used : Expression(org.apache.spark.sql.connector.expressions.Expression) ClusteredDistribution(org.apache.spark.sql.connector.distributions.ClusteredDistribution) Distribution(org.apache.spark.sql.connector.distributions.Distribution) UnspecifiedDistribution(org.apache.spark.sql.connector.distributions.UnspecifiedDistribution) OrderedDistribution(org.apache.spark.sql.connector.distributions.OrderedDistribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) OrderedDistribution(org.apache.spark.sql.connector.distributions.OrderedDistribution) ClusteredDistribution(org.apache.spark.sql.connector.distributions.ClusteredDistribution)

Example 2 with Distribution

Use of org.apache.spark.sql.connector.distributions.Distribution in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, the method testRangeCopyOnWriteDeleteUnpartitionedSortedTable:

/**
 * Verifies that a range-distributed copy-on-write DELETE on an unpartitioned table with a
 * defined sort order (id ASC, data ASC) is ordered by that sort order.
 */
@Test
public void testRangeCopyOnWriteDeleteUnpartitionedSortedTable() {
    sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Configure range distribution for deletes and a two-column ascending sort order.
    table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit();
    table.replaceSortOrder().asc("id").asc("data").commit();

    SortOrder idAscending = Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING);
    SortOrder dataAscending = Expressions.sort(Expressions.column("data"), SortDirection.ASCENDING);
    SortOrder[] expectedOrdering = new SortOrder[] { idAscending, dataAscending };

    Distribution expectedDistribution = Distributions.ordered(expectedOrdering);
    checkCopyOnWriteDistributionAndOrdering(table, DELETE, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Example 3 with Distribution

Use of org.apache.spark.sql.connector.distributions.Distribution in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, the method testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable:

/**
 * Verifies that a range-distributed copy-on-write DELETE on an unpartitioned table with no
 * sort order falls back to the file/position ordering.
 */
@Test
public void testRangeCopyOnWriteDeleteUnpartitionedUnsortedTable() {
    sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Only the distribution mode is set; no table sort order is defined.
    table.updateProperties().set(DELETE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_RANGE).commit();

    checkCopyOnWriteDistributionAndOrdering(
        table, DELETE, Distributions.ordered(FILE_POSITION_ORDERING), FILE_POSITION_ORDERING);
}
Also used : Table(org.apache.iceberg.Table) Distribution(org.apache.spark.sql.connector.distributions.Distribution) Test(org.junit.Test)

Example 4 with Distribution

Use of org.apache.spark.sql.connector.distributions.Distribution in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, the method testHashWritePartitionedSortedTable:

/**
 * Verifies that a hash-distributed write to a partitioned, sorted table clusters by the
 * partition expressions and orders by partition fields followed by the table sort order.
 */
@Test
public void testHashWritePartitionedSortedTable() {
    sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + "USING iceberg " + "PARTITIONED BY (date, bucket(8, data))", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Hash-distribute writes and sort rows by id within each partition.
    table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
    table.replaceSortOrder().asc("id").commit();

    // Clustering mirrors the partition spec: identity(date), bucket(8, data).
    Expression[] expectedClustering =
        new Expression[] { Expressions.identity("date"), Expressions.bucket(8, "data") };
    Distribution expectedDistribution = Distributions.clustered(expectedClustering);

    // Ordering: partition fields first, then the table sort order.
    SortOrder[] expectedOrdering = new SortOrder[] {
        Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
    };

    checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Expression(org.apache.spark.sql.connector.expressions.Expression) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Example 5 with Distribution

Use of org.apache.spark.sql.connector.distributions.Distribution in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, the method testHashCopyOnWriteMergePartitionedSortedTable:

/**
 * Verifies that a hash-distributed copy-on-write MERGE on a partitioned, sorted table
 * clusters by the partition expressions and orders by partition fields then the sort order.
 */
@Test
public void testHashCopyOnWriteMergePartitionedSortedTable() {
    sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + "USING iceberg " + "PARTITIONED BY (date, bucket(8, data))", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Hash-distribute MERGE writes and sort rows by id within each partition.
    table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
    table.replaceSortOrder().asc("id").commit();

    // Clustering mirrors the partition spec: identity(date), bucket(8, data).
    Expression[] expectedClustering =
        new Expression[] { Expressions.identity("date"), Expressions.bucket(8, "data") };
    Distribution expectedDistribution = Distributions.clustered(expectedClustering);

    // Ordering: partition fields first, then the table sort order.
    SortOrder[] expectedOrdering = new SortOrder[] {
        Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
    };

    checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Expression(org.apache.spark.sql.connector.expressions.Expression) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Aggregations

Distribution (org.apache.spark.sql.connector.distributions.Distribution)39 Table (org.apache.iceberg.Table)32 SortOrder (org.apache.spark.sql.connector.expressions.SortOrder)32 Test (org.junit.Test)32 Expression (org.apache.spark.sql.connector.expressions.Expression)9 DistributionMode (org.apache.iceberg.DistributionMode)3 Schema (org.apache.iceberg.Schema)2 UpdateSchema (org.apache.iceberg.UpdateSchema)1 NestedField (org.apache.iceberg.types.Types.NestedField)1 Dataset (org.apache.spark.sql.Dataset)1 Row (org.apache.spark.sql.Row)1 SparkSession (org.apache.spark.sql.SparkSession)1 LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan)1 ClusteredDistribution (org.apache.spark.sql.connector.distributions.ClusteredDistribution)1 OrderedDistribution (org.apache.spark.sql.connector.distributions.OrderedDistribution)1 UnspecifiedDistribution (org.apache.spark.sql.connector.distributions.UnspecifiedDistribution)1 SQLConf (org.apache.spark.sql.internal.SQLConf)1