Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesUnpartitionedTable:
@Test
public void testRewriteDataFilesUnpartitionedTable() {
PartitionSpec spec = PartitionSpec.unpartitioned();
Map<String, String> options = Maps.newHashMap();
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
writeRecords(records1);
List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
writeRecords(records2);
table.refresh();
CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
Actions actions = Actions.forTable(table);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
table.refresh();
CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size());
List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
expectedRecords.addAll(records1);
expectedRecords.addAll(records2);
Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
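The writeRecords helper called above is not shown in this excerpt. A minimal sketch of what it does, assuming the spark session and tableLocation fields already referenced in the test (the actual implementation in TestRewriteDataFilesAction may repartition or sort the data differently):

// Hypothetical sketch of the helper used by these tests: convert the bean
// records to a DataFrame and append them to the Iceberg table at tableLocation.
private void writeRecords(List<ThreeColumnRecord> records) {
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);
}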
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesPartitionedTable:
@Test
public void testRewriteDataFilesPartitionedTable() {
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
Map<String, String> options = Maps.newHashMap();
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
writeRecords(records1);
List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
writeRecords(records2);
List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
writeRecords(records3);
List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
writeRecords(records4);
table.refresh();
CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
Actions actions = Actions.forTable(table);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFiles().size());
table.refresh();
CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size());
List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
expectedRecords.addAll(records1);
expectedRecords.addAll(records2);
expectedRecords.addAll(records3);
expectedRecords.addAll(records4);
Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
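Each of the four partitions is compacted into a single file here because the default target file size is far larger than these tiny test files. The same Actions API also lets the caller tune that target; a brief sketch, assuming the targetSizeInBytes option exposed by the rewrite action in this version:

// Hypothetical tuning: pack rewritten data files toward roughly 128 MB outputs.
RewriteDataFilesActionResult tunedResult = Actions.forTable(table)
    .rewriteDataFiles()
    .targetSizeInBytes(128 * 1024 * 1024L)
    .execute();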
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteDataFilesWithFilter:
@Test
public void testRewriteDataFilesWithFilter() {
PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
Map<String, String> options = Maps.newHashMap();
Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
writeRecords(records1);
List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
writeRecords(records2);
List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
writeRecords(records3);
List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
writeRecords(records4);
table.refresh();
CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
Actions actions = Actions.forTable(table);
RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c1", 1)).filter(Expressions.startsWith("c2", "AA")).execute();
Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
table.refresh();
CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
Assert.assertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size());
List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
expectedRecords.addAll(records1);
expectedRecords.addAll(records2);
expectedRecords.addAll(records3);
expectedRecords.addAll(records4);
Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
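A note on the two filter(...) calls above: the rewrite action combines successive filters with a logical AND, so only files in the partition (c1 = 1, truncate(c2, 2) = "AA") are rewritten, which is why just two of the eight files are replaced. An equivalent single-expression form (combined is an illustrative local name):

// The two chained filters behave like this single combined predicate.
Expression combined = Expressions.and(
    Expressions.equal("c1", 1),
    Expressions.startsWith("c2", "AA"));
actions.rewriteDataFiles().filter(combined).execute();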
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
The class TestRequiredDistributionAndOrdering, method testRangeDistributionWithQuotedColumnNames:
@Test
public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException {
sql("CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, `c.1`))", tableName);
List<ThreeColumnRecord> data = ImmutableList.of(
    new ThreeColumnRecord(1, null, "A"),
    new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(3, "BBBBBBBBBB", "A"),
    new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(5, "BBBBBBBBBB", "A"),
    new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
Dataset<Row> inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`");
sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName);
inputDF.writeTo(tableName).append();
assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
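The backticks matter because the column name contains a dot: both the DDL and the DataFrame expressions wrap c.1 in backticks so Spark treats it as a single column rather than as field access on a struct named c. Reading the data back needs the same quoting; a small illustrative snippet (readBack is a hypothetical local name):

// Backticks keep Spark from parsing "c.1" as struct field access.
Dataset<Row> readBack = spark.table(tableName).selectExpr("`c.1`", "c2", "c3");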
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
The class TestRequiredDistributionAndOrdering, method testPartitionColumnsArePrependedForRangeDistribution:
@Test
public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException {
sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
List<ThreeColumnRecord> data = ImmutableList.of(
    new ThreeColumnRecord(1, null, "A"),
    new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(3, "BBBBBBBBBB", "A"),
    new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(5, "BBBBBBBBBB", "A"),
    new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
    new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
// should automatically prepend partition columns to the ordering
sql("ALTER TABLE %s WRITE ORDERED BY c1, c2", tableName);
inputDF.writeTo(tableName).append();
assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}