Usage of org.apache.iceberg.spark.source.ThreeColumnRecord in the Apache Iceberg project.
From class TestRequiredDistributionAndOrdering, method testDisabledDistributionAndOrdering:
/**
 * Verifies that an append is rejected when the write option
 * SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING is set to "false".
 *
 * <p>The table is partitioned by bucket(2, c1) and the input is a single partition sorted by c1.
 * The test expects a SparkException and passes "Writing job aborted" as the expected message text.
 *
 * <p>NOTE(review): this excerpt stops inside the catch block; the closing braces of the catch,
 * the lambda, the assertThrows call and the method itself are not visible here.
 */
public void testDisabledDistributionAndOrdering() {
// create an Iceberg table with three columns, partitioned into 2 buckets on c1
sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
// seven rows with c1 = 1..7; the first row has a null c2
List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
// collapse to one partition and sort it by c1 (the raw column, not its bucket value)
Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
// should fail if ordering is disabled
// NOTE(review): presumably the failure comes from rows not being grouped by bucket(2, c1)
// once the table's distribution and ordering are not applied -- confirm against the writer.
AssertHelpers.assertThrows("Should reject writes without ordering", SparkException.class, "Writing job aborted", () -> {
try {
// append with the table's distribution and ordering explicitly switched off
inputDF.writeTo(tableName).option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false").append();
} catch (NoSuchTableException e) {
// rethrow the NoSuchTableException as unchecked, keeping the original as the cause
throw new RuntimeException(e);
From class TestRequiredDistributionAndOrdering, method testDefaultLocalSortWithBucketTransforms:
/**
 * Exercises the default write path for a table partitioned by bucket(2, c1): the input is a
 * single partition sorted by c1, and the test asserts that the table ends up with 7 rows.
 *
 * <p>NOTE(review): no statement in this excerpt writes inputDF to the table, and the method's
 * closing brace is not visible; the excerpt appears truncated -- confirm against the full source.
 *
 * @throws NoSuchTableException declared by the method; nothing visible in this excerpt throws it
 */
public void testDefaultLocalSortWithBucketTransforms() throws NoSuchTableException {
// create an Iceberg table with three columns, partitioned into 2 buckets on c1
sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
// seven rows with c1 = 1..7; the first row has a null c2
List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
// collapse to one partition and sort it by c1 (the raw column, not its bucket value)
Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
// should insert a local sort by partition columns by default
// NOTE(review): the write of inputDF that this comment refers to is not present in the excerpt.
// all seven input rows are expected to be in the table
assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
From class TestRequiredDistributionAndOrdering, method testSortOrderIncludesPartitionColumns:
/**
 * Sets an explicit table write order (bucket(2, c3), c1, c2) on a table partitioned by
 * bucket(2, c1) and asserts that the table ends up with 7 rows.
 *
 * <p>NOTE(review): no statement in this excerpt writes inputDF to the table, and the method's
 * closing brace is not visible; the excerpt appears truncated -- confirm against the full source.
 *
 * @throws NoSuchTableException declared by the method; nothing visible in this excerpt throws it
 */
public void testSortOrderIncludesPartitionColumns() throws NoSuchTableException {
// create an Iceberg table with three columns, partitioned into 2 buckets on c1
sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
// seven rows with c1 = 1..7; the first row has a null c2
List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
// collapse to one partition and sort it by c1 (the raw column, not its bucket value)
Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
// should succeed with a correct sort order
// the table-level write order is changed here, before any write happens
sql("ALTER TABLE %s WRITE ORDERED BY bucket(2, c3), c1, c2", tableName);
// NOTE(review): the write of inputDF is not present in the excerpt.
// all seven input rows are expected to be in the table
assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
From class TestRequiredDistributionAndOrdering, method testHashDistributionOnBucketedColumn:
/**
 * Per its name and inline comment, targets a hash-distributed write into a table partitioned by
 * bucket(2, c1); the visible assertion checks that the table ends up with 7 rows.
 *
 * <p>NOTE(review): this excerpt contains neither a write of inputDF nor any setting that selects
 * hash distribution, and the method's closing brace is not visible; the excerpt appears
 * truncated -- confirm against the full source.
 *
 * @throws NoSuchTableException declared by the method; nothing visible in this excerpt throws it
 */
public void testHashDistributionOnBucketedColumn() throws NoSuchTableException {
// create an Iceberg table with three columns, partitioned into 2 buckets on c1
sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
// seven rows with c1 = 1..7; the first row has a null c2
List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
// collapse to one partition and sort it by c1 (the raw column, not its bucket value)
Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
// should automatically prepend partition columns to the local ordering after hash distribution
// NOTE(review): the write that this comment refers to is not present in the excerpt.
// all seven input rows are expected to be in the table
assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
From class TestRewriteDataFilesAction, method testRewriteLargeTableHasResiduals:
/**
 * Checks a filtered rewrite on an unpartitioned table: with the filter c3 = "0", the rewrite
 * action is expected to rewrite 2 data files into 1 and leave the table's rows unchanged.
 *
 * <p>Before the rewrite, the test also plans a scan with ignoreResiduals() and the same filter,
 * and asserts that every resulting task reports an always-true residual.
 *
 * <p>NOTE(review): this excerpt appears truncated. Neither for loop has a visible closing brace,
 * no statement writes df to the table, and the method's closing brace is missing, so the exact
 * extent of the loop bodies cannot be determined from here -- confirm against the full source.
 */
public void testRewriteLargeTableHasResiduals() {
// a spec built with no partition fields, i.e. the table is unpartitioned
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
Map<String, String> options = Maps.newHashMap();
// set the Parquet row-group size property to 100 (bytes, per the property name)
options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
// all records belong to the same partition
List<ThreeColumnRecord> records = Lists.newArrayList();
// 100 records: c1 = i, c2 = i as a string, c3 cycles through "0".."3"
for (int i = 0; i < 100; i++) {
records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
// NOTE(review): df is not written anywhere in this excerpt, yet 2 data files are expected below.
// snapshot of the table contents before the rewrite, compared against the result at the end
List<Object[]> expectedRecords = currentData();
// plan files for c3 = "0" while asking the scan to ignore residual filters
CloseableIterable<FileScanTask> tasks = table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles();
for (FileScanTask task : tasks) {
// with residuals ignored, each task should carry no remaining filter
Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
// precondition for the rewrite: the table holds exactly 2 data files
shouldHaveFiles(table, 2);
// run the rewrite restricted by the same c3 = "0" filter
Result result = basicRewrite(table).filter(Expressions.equal("c3", "0")).execute();
Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount());
Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFilesCount());
// the rewrite must not change the table's logical contents
List<Object[]> actualRecords = currentData();
assertEquals("Rows must match", expectedRecords, actualRecords);