Use of org.apache.spark.sql.connector.expressions.Expression in project Iceberg by Apache.
The class Spark3Util, method findWidth.
@SuppressWarnings("unchecked")
private static int findWidth(Transform transform) {
  for (Expression expr : transform.arguments()) {
    if (expr instanceof Literal) {
      if (((Literal) expr).dataType() instanceof IntegerType) {
        Literal<Integer> lit = (Literal<Integer>) expr;
        Preconditions.checkArgument(lit.value() > 0,
            "Unsupported width for transform: %s", transform.describe());
        return lit.value();

      } else if (((Literal) expr).dataType() instanceof LongType) {
        Literal<Long> lit = (Literal<Long>) expr;
        // a long width must also fit in an int; the precondition rejects anything larger
        Preconditions.checkArgument(lit.value() > 0 && lit.value() < Integer.MAX_VALUE,
            "Unsupported width for transform: %s", transform.describe());
        return lit.value().intValue();
      }
    }
  }

  throw new IllegalArgumentException("Cannot find width for transform: " + transform.describe());
}
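For context, a minimal sketch of the kind of transform findWidth inspects. Expressions.bucket is Spark's public transform factory; because findWidth itself is private to Spark3Util, the loop below inlines the same width-extraction logic rather than calling it:

import org.apache.spark.sql.connector.expressions.Expression;
import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Literal;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.types.IntegerType;

public class FindWidthSketch {
  public static void main(String[] args) {
    // bucket(16, data): arguments() holds the width literal and the column reference
    Transform bucket = Expressions.bucket(16, "data");
    for (Expression expr : bucket.arguments()) {
      if (expr instanceof Literal && ((Literal<?>) expr).dataType() instanceof IntegerType) {
        System.out.println("width = " + ((Literal<?>) expr).value()); // prints: width = 16
      }
    }
  }
}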
Use of org.apache.spark.sql.connector.expressions.Expression in project Iceberg by Apache.
The class SparkDistributionAndOrderingUtil, method buildPositionMergeDistribution.
private static Distribution buildPositionMergeDistribution(Table table, DistributionMode distributionMode) {
  switch (distributionMode) {
    case NONE:
      return Distributions.unspecified();

    case HASH:
      if (table.spec().isUnpartitioned()) {
        // cluster position deletes by spec ID, partition, and file path
        Expression[] clustering = new Expression[] {SPEC_ID, PARTITION, FILE_PATH};
        return Distributions.clustered(clustering);
      } else {
        // prepend the delete metadata clustering to the table's data clustering
        Distribution dataDistribution = buildRequiredDistribution(table, distributionMode);
        Expression[] dataClustering = ((ClusteredDistribution) dataDistribution).clustering();
        Expression[] deleteClustering = new Expression[] {SPEC_ID, PARTITION};
        Expression[] clustering = ObjectArrays.concat(deleteClustering, dataClustering, Expression.class);
        return Distributions.clustered(clustering);
      }

    case RANGE:
      // prepend the delete metadata ordering to the table's data ordering
      Distribution dataDistribution = buildRequiredDistribution(table, distributionMode);
      SortOrder[] dataOrdering = ((OrderedDistribution) dataDistribution).ordering();
      SortOrder[] deleteOrdering = new SortOrder[] {SPEC_ID_ORDER, PARTITION_ORDER, FILE_PATH_ORDER};
      SortOrder[] ordering = ObjectArrays.concat(deleteOrdering, dataOrdering, SortOrder.class);
      return Distributions.ordered(ordering);

    default:
      throw new IllegalArgumentException("Unexpected distribution mode: " + distributionMode);
  }
}
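As a hedged illustration of the HASH branch's array concatenation: the sketch below uses hypothetical Expressions.column stand-ins for the class's SPEC_ID and PARTITION constants, and plain Guava where Iceberg's real code imports its relocated ObjectArrays:

import com.google.common.collect.ObjectArrays;
import org.apache.spark.sql.connector.expressions.Expression;
import org.apache.spark.sql.connector.expressions.Expressions;

public class MergeClusteringSketch {
  public static void main(String[] args) {
    // hypothetical stand-ins for the SPEC_ID and PARTITION metadata column expressions
    Expression[] deleteClustering = new Expression[] {
        Expressions.column("_spec_id"), Expressions.column("_partition")
    };
    Expression[] dataClustering = new Expression[] {Expressions.identity("date")};

    // delete metadata columns lead, so position deletes for the same spec and
    // partition land in the same task; the table's data clustering follows
    Expression[] clustering = ObjectArrays.concat(deleteClustering, dataClustering, Expression.class);
    for (Expression expr : clustering) {
      System.out.println(expr.describe()); // _spec_id, _partition, identity(date)
    }
  }
}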
Use of org.apache.spark.sql.connector.expressions.Expression in project Iceberg by Apache.
The class TestSparkDistributionAndOrderingUtil, method testHashWritePartitionedSortedTable.
@Test
public void testHashWritePartitionedSortedTable() {
  sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " +
      "USING iceberg " +
      "PARTITIONED BY (date, bucket(8, data))", tableName);
  Table table = validationCatalog.loadTable(tableIdent);
  table.updateProperties().set(WRITE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
  table.replaceSortOrder().asc("id").commit();

  Expression[] expectedClustering =
      new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")};
  Distribution expectedDistribution = Distributions.clustered(expectedClustering);
  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
  };

  checkWriteDistributionAndOrdering(table, expectedDistribution, expectedOrdering);
}
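Note how the two expectations line up: the clustering contains only the partition transforms, date and bucket(8, data), while the ordering repeats those transforms and then appends the table sort order, id. In other words, a hash-distributed write clusters rows by partition and sorts within each task by partition followed by the table's sort order.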
Use of org.apache.spark.sql.connector.expressions.Expression in project Iceberg by Apache.
The class TestSparkDistributionAndOrderingUtil, method testHashCopyOnWriteMergePartitionedSortedTable.
@Test
public void testHashCopyOnWriteMergePartitionedSortedTable() {
  sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " +
      "USING iceberg " +
      "PARTITIONED BY (date, bucket(8, data))", tableName);
  Table table = validationCatalog.loadTable(tableIdent);
  table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();
  table.replaceSortOrder().asc("id").commit();

  Expression[] expectedClustering =
      new Expression[] {Expressions.identity("date"), Expressions.bucket(8, "data")};
  Distribution expectedDistribution = Distributions.clustered(expectedClustering);
  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.bucket(8, "data"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.column("id"), SortDirection.ASCENDING)
  };

  checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
Use of org.apache.spark.sql.connector.expressions.Expression in project Iceberg by Apache.
The class TestSparkDistributionAndOrderingUtil, method testHashCopyOnWriteMergePartitionedUnsortedTable.
@Test
public void testHashCopyOnWriteMergePartitionedUnsortedTable() {
  sql("CREATE TABLE %s (id BIGINT, data STRING, date DATE, ts TIMESTAMP) " +
      "USING iceberg " +
      "PARTITIONED BY (date, days(ts))", tableName);
  Table table = validationCatalog.loadTable(tableIdent);
  table.updateProperties().set(MERGE_DISTRIBUTION_MODE, WRITE_DISTRIBUTION_MODE_HASH).commit();

  Expression[] expectedClustering =
      new Expression[] {Expressions.identity("date"), Expressions.days("ts")};
  Distribution expectedDistribution = Distributions.clustered(expectedClustering);
  SortOrder[] expectedOrdering = new SortOrder[] {
      Expressions.sort(Expressions.column("date"), SortDirection.ASCENDING),
      Expressions.sort(Expressions.days("ts"), SortDirection.ASCENDING)
  };

  checkCopyOnWriteDistributionAndOrdering(table, MERGE, expectedDistribution, expectedOrdering);
}
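This unsorted variant shows the complementary case: because no table sort order is set, the expected ordering stops at the partition expressions, date and days(ts); there is no trailing id sort as in the sorted tables above.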