Search in sources :

Example 1 with Expression

Use of org.apache.spark.sql.connector.expressions.Expression in the Apache Iceberg project.

From the class Spark3Util, method findWidth.

/**
 * Extracts the integer width argument of a partition transform (e.g. bucket/truncate).
 *
 * <p>Scans the transform's arguments for the first integer or long literal and returns it as an
 * {@code int}. A long width must fit strictly below {@link Integer#MAX_VALUE}.
 *
 * @param transform the Spark transform whose width argument is requested
 * @return the positive width value
 * @throws IllegalArgumentException if the width is non-positive, too large, or no integer/long
 *     literal argument exists
 */
@SuppressWarnings("unchecked")
private static int findWidth(Transform transform) {
    for (Expression expr : transform.arguments()) {
        if (expr instanceof Literal) {
            if (((Literal) expr).dataType() instanceof IntegerType) {
                Literal<Integer> lit = (Literal<Integer>) expr;
                Preconditions.checkArgument(lit.value() > 0, "Unsupported width for transform: %s", transform.describe());
                return lit.value();
            } else if (((Literal) expr).dataType() instanceof LongType) {
                Literal<Long> lit = (Literal<Long>) expr;
                // The precondition enforces 0 < value < Integer.MAX_VALUE, so the narrowing
                // conversion below is lossless. (A redundant, unreachable re-check of the upper
                // bound that threw a message-less IllegalArgumentException was removed.)
                Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE, "Unsupported width for transform: %s", transform.describe());
                return lit.value().intValue();
            }
        }
    }
    throw new IllegalArgumentException("Cannot find width for transform: " + transform.describe());
}
Also used : IntegerType(org.apache.spark.sql.types.IntegerType) LongType(org.apache.spark.sql.types.LongType) Expression(org.apache.spark.sql.connector.expressions.Expression) Literal(org.apache.spark.sql.connector.expressions.Literal)

Example 2 with Expression

Use of org.apache.spark.sql.connector.expressions.Expression in the Apache Iceberg project.

From the class SparkDistributionAndOrderingUtil, method buildPositionMergeDistribution.

/**
 * Builds the required distribution for a position-delete merge write.
 *
 * <p>NONE yields an unspecified distribution. HASH clusters by the delete metadata columns
 * (spec id, partition, and — for unpartitioned tables — file path), prepending them to the
 * table's data clustering when it is partitioned. RANGE prepends the delete ordering columns
 * to the table's data ordering.
 */
private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) {
    switch (distributionMode) {
        case NONE:
            return Distributions.unspecified();

        case HASH:
            // Unpartitioned tables have no data clustering to merge with; cluster purely
            // on the delete metadata columns, including the file path.
            if (table.spec().isUnpartitioned()) {
                return Distributions.clustered(new Expression[] { SPEC_ID, PARTITION, FILE_PATH });
            }
            // Partitioned tables: put the delete metadata columns ahead of the data clustering.
            ClusteredDistribution dataDistribution =
                (ClusteredDistribution) buildRequiredDistribution(table, distributionMode);
            Expression[] mergedClustering = ObjectArrays.concat(
                new Expression[] { SPEC_ID, PARTITION },
                dataDistribution.clustering(),
                Expression.class);
            return Distributions.clustered(mergedClustering);

        case RANGE:
            // Range: delete ordering columns first, then the table's data ordering.
            OrderedDistribution orderedDistribution =
                (OrderedDistribution) buildRequiredDistribution(table, distributionMode);
            SortOrder[] mergedOrdering = ObjectArrays.concat(
                new SortOrder[] { SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER },
                orderedDistribution.ordering(),
                SortOrder.class);
            return Distributions.ordered(mergedOrdering);

        default:
            throw new IllegalArgumentException("Unexpected distribution mode: " + distributionMode);
    }
}
Also used : Expression(org.apache.spark.sql.connector.expressions.Expression) ClusteredDistribution(org.apache.spark.sql.connector.distributions.ClusteredDistribution) Distribution(org.apache.spark.sql.connector.distributions.Distribution) UnspecifiedDistribution(org.apache.spark.sql.connector.distributions.UnspecifiedDistribution) OrderedDistribution(org.apache.spark.sql.connector.distributions.OrderedDistribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder)

Example 3 with Expression

Use of org.apache.spark.sql.connector.expressions.Expression in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, method testHashWritePartitionedSortedTable.

@Test
public void testHashWritePartitionedSortedTable() {
    // Table partitioned by an identity transform on date and an 8-way bucket on data.
    sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + "USING iceberg " + "PARTITIONED BY (date, bucket(8, data))", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Request hash distribution on write and a sort order on id.
    table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
    table.replaceSortOrder().asc("id").commit();

    // Expect clustering on both partition transforms.
    Expression dateCluster = Expressions.identity("date");
    Expression dataCluster = Expressions.bucket(8, "data");
    Distribution expectedDistribution =
        Distributions.clustered(new Expression[] { dateCluster, dataCluster });

    // Expect ordering by partition transforms first, then the table sort order.
    SortOrder[] expectedOrdering = new SortOrder[] {
        Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
    };

    checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Expression(org.apache.spark.sql.connector.expressions.Expression) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Example 4 with Expression

Use of org.apache.spark.sql.connector.expressions.Expression in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, method testHashCopyOnWriteMergePartitionedSortedTable.

@Test
public void testHashCopyOnWriteMergePartitionedSortedTable() {
    // Table partitioned by an identity transform on date and an 8-way bucket on data.
    sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + "USING iceberg " + "PARTITIONED BY (date, bucket(8, data))", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Request hash distribution for MERGE and a sort order on id.
    table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
    table.replaceSortOrder().asc("id").commit();

    // Expect clustering on both partition transforms.
    Expression dateCluster = Expressions.identity("date");
    Expression dataCluster = Expressions.bucket(8, "data");
    Distribution expectedDistribution =
        Distributions.clustered(new Expression[] { dateCluster, dataCluster });

    // Expect ordering by partition transforms first, then the table sort order.
    SortOrder[] expectedOrdering = new SortOrder[] {
        Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
    };

    checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Expression(org.apache.spark.sql.connector.expressions.Expression) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Example 5 with Expression

Use of org.apache.spark.sql.connector.expressions.Expression in the Apache Iceberg project.

From the class TestSparkDistributionAndOrderingUtil, method testHashCopyOnWriteMergePartitionedUnsortedTable.

@Test
public void testHashCopyOnWriteMergePartitionedUnsortedTable() {
    // Table partitioned by an identity transform on date and a days transform on ts;
    // no table sort order is set.
    sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " + "USING iceberg " + "PARTITIONED BY (date, days(ts))", tableName);

    Table table = validationCatalog.loadTable(tableIdent);

    // Request hash distribution for MERGE.
    table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();

    // Expect clustering on both partition transforms.
    Expression dateCluster = Expressions.identity("date");
    Expression tsCluster = Expressions.days("ts");
    Distribution expectedDistribution =
        Distributions.clustered(new Expression[] { dateCluster, tsCluster });

    // With no table sort order, expect ordering by partition transforms only.
    SortOrder[] expectedOrdering = new SortOrder[] {
        Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
        Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING)
    };

    checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
Also used : Table(org.apache.iceberg.Table) Expression(org.apache.spark.sql.connector.expressions.Expression) Distribution(org.apache.spark.sql.connector.distributions.Distribution) SortOrder(org.apache.spark.sql.connector.expressions.SortOrder) Test(org.junit.Test)

Aggregations

Expression (org.apache.spark.sql.connector.expressions.Expression)10 Distribution (org.apache.spark.sql.connector.distributions.Distribution)9 Table (org.apache.iceberg.Table)8 SortOrder (org.apache.spark.sql.connector.expressions.SortOrder)8 Test (org.junit.Test)8 ClusteredDistribution (org.apache.spark.sql.connector.distributions.ClusteredDistribution)1 OrderedDistribution (org.apache.spark.sql.connector.distributions.OrderedDistribution)1 UnspecifiedDistribution (org.apache.spark.sql.connector.distributions.UnspecifiedDistribution)1 Literal (org.apache.spark.sql.connector.expressions.Literal)1 IntegerType (org.apache.spark.sql.types.IntegerType)1 LongType (org.apache.spark.sql.types.LongType)1