Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRewriteManifestsAction, method testRewriteManifestsWithPredicate.
@Test
public void testRewriteManifestsWithPredicate() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
  writeRecords(records2);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());

  SparkActions actions = SparkActions.get();

  // rewrite only the first manifest without caching
  RewriteManifests.Result result = actions.rewriteManifests(table)
      .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path()))
      .stagingLocation(temp.newFolder().toString())
      .option("use-caching", "false")
      .execute();

  Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests()));
  Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests()));

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
  Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0)));
  Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1)));

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
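The test above appends data through a writeRecords helper that is not part of this excerpt. A minimal sketch of what such helpers could look like in these test classes, assuming a shared spark session, a tableLocation field, and the c1/c2/c3 columns used throughout this page:

// Hypothetical reconstruction of the helpers these tests rely on; names and
// column order are assumptions based on how they are called in the excerpts.
private void writeRecords(List<ThreeColumnRecord> records) {
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  writeDF(df);
}

private void writeDF(Dataset<Row> df) {
  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);
}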
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRewriteDataFilesAction, method testRewriteDataFilesForLargeFile.
@Test
public void testRewriteDataFilesForLargeFile() throws AnalysisException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
  Assert.assertNull("Table must be empty", table.currentSnapshot());

  List<ThreeColumnRecord> records1 = Lists.newArrayList();
  IntStream.range(0, 2000).forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i)));
  Dataset<Row> df = spark.createDataFrame(records1, ThreeColumnRecord.class).repartition(1);
  writeDF(df);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"),
      new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD"));
  writeRecords(records2);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  DataFile maxSizeFile = Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes));
  Assert.assertEquals("Should have 3 files before rewrite", 3, dataFiles.size());

  spark.read().format("iceberg").load(tableLocation).createTempView("origin");
  long originalNumRecords = spark.read().format("iceberg").load(tableLocation).count();
  List<Object[]> originalRecords = sql("SELECT * from origin sort by c2");

  Actions actions = Actions.forTable(table);

  long targetSizeInBytes = maxSizeFile.fileSizeInBytes() - 10;
  RewriteDataFilesActionResult result = actions.rewriteDataFiles()
      .targetSizeInBytes(targetSizeInBytes)
      .splitOpenFileCost(1)
      .execute();

  Assert.assertEquals("Action should delete 3 data files", 3, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFiles().size());

  spark.read().format("iceberg").load(tableLocation).createTempView("postRewrite");
  long postRewriteNumRecords = spark.read().format("iceberg").load(tableLocation).count();
  List<Object[]> rewrittenRecords = sql("SELECT * from postRewrite sort by c2");

  Assert.assertEquals(originalNumRecords, postRewriteNumRecords);
  assertEquals("Rows should be unchanged", originalRecords, rewrittenRecords);
}
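The comparison above goes through a sql(...) helper that this excerpt does not show. A minimal sketch of such a helper, assuming it simply runs the query on the shared spark session and unwraps each Row into an Object[] (the real base class may also format arguments into the query string, which is omitted here):

// Hypothetical helper; requires java.util.stream.Collectors and IntStream.
private List<Object[]> sql(String query) {
  List<Row> rows = spark.sql(query).collectAsList();
  return rows.stream()
      .map(row -> IntStream.range(0, row.size())
          .mapToObj(row::get)
          .toArray())
      .collect(Collectors.toList());
}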
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRewriteDataFilesAction, method testRewriteLargeTableHasResiduals.
@Test
public void testRewriteLargeTableHasResiduals() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  // all records belong to the same partition
  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 100; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
  }
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  writeDF(df);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan()
      .ignoreResiduals()
      .filter(Expressions.equal("c3", "0"))
      .planFiles();
  for (FileScanTask task : tasks) {
    Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
  }
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles()
      .filter(Expressions.equal("c3", "0"))
      .execute();
  Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());

  table.refresh();

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .sort("c1")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", records, actualRecords);
}
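All of these tests round-trip rows through Encoders.bean(ThreeColumnRecord.class), so the record must be a plain Java bean. A minimal sketch of what ThreeColumnRecord could look like, with field types inferred from the constructor calls in the excerpts rather than copied from the Iceberg source; equals and hashCode are needed for the list assertions to compare by value:

// Sketch only: an Integer c1 and String c2/c3, as the (int, String, String)
// constructor calls above suggest. Requires java.util.Objects.
public class ThreeColumnRecord {
  private Integer c1;
  private String c2;
  private String c3;

  public ThreeColumnRecord() {
  }

  public ThreeColumnRecord(Integer c1, String c2, String c3) {
    this.c1 = c1;
    this.c2 = c2;
    this.c3 = c3;
  }

  public Integer getC1() { return c1; }
  public void setC1(Integer c1) { this.c1 = c1; }
  public String getC2() { return c2; }
  public void setC2(String c2) { this.c2 = c2; }
  public String getC3() { return c3; }
  public void setC3(String c3) { this.c3 = c3; }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    ThreeColumnRecord that = (ThreeColumnRecord) o;
    return Objects.equals(c1, that.c1) && Objects.equals(c2, that.c2) && Objects.equals(c3, that.c3);
  }

  @Override
  public int hashCode() {
    return Objects.hash(c1, c2, c3);
  }
}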
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRewriteDataFilesAction, method testRewriteToOutputPartitionSpec.
@Test
public void testRewriteToOutputPartitionSpec() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").build();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  table.updateSpec().addField(Expressions.truncate("c2", 2)).commit();
  Assert.assertEquals("Should have 2 partitions specs", 2, table.specs().size());

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"),
      new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"),
      new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"),
      new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
  writeRecords(records4);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());

  Dataset<Row> beforeResultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> beforeActualFilteredRecords = beforeResultDF
      .sort("c1", "c2", "c3")
      .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records2, beforeActualFilteredRecords);

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().outputSpecId(0).execute();
  Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size());
  Assert.assertTrue(result.deletedDataFiles().stream().allMatch(df -> df.specId() == 1));
  Assert.assertTrue(result.addedDataFiles().stream().allMatch(df -> df.specId() == 0));

  table.refresh();

  CloseableIterable<FileScanTask> tasks2 = table.newScan().planFiles();
  List<DataFile> dataFiles2 = Lists.newArrayList(CloseableIterable.transform(tasks2, FileScanTask::file));
  Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles2.size());

  // Should still have all the same data
  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .sort("c1", "c2", "c3")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);

  List<ThreeColumnRecord> actualFilteredRecords = resultDF
      .sort("c1", "c2", "c3")
      .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records2, actualFilteredRecords);

  List<ThreeColumnRecord> records5 = Lists.newArrayList(
      new ThreeColumnRecord(3, "CCCCCCCCCC", "FFFF"),
      new ThreeColumnRecord(3, "CCCCCCCCCC", "HHHH"));
  writeRecords(records5);
  expectedRecords.addAll(records5);

  actualRecords = resultDF
      .sort("c1", "c2", "c3")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
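In this test, outputSpecId(0) targets the table's original identity("c1") spec, while the updateSpec() commit registered the truncate("c2", 2) layout as spec 1 and made it the default. A small, hypothetical way to inspect which spec IDs a table currently knows about, using the Table API calls already shown above:

// table.specs() maps each spec ID to its PartitionSpec; table.spec() is the
// current default spec. Output formatting here is just for illustration.
table.specs().forEach((specId, partitionSpec) ->
    System.out.println(specId + " -> " + partitionSpec));
System.out.println("default spec id: " + table.spec().specId());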
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRewriteDataFilesProcedure, method insertData.
private void insertData(int filesCount) {
  ThreeColumnRecord record1 = new ThreeColumnRecord(1, "foo", null);
  ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null);

  List<ThreeColumnRecord> records = Lists.newArrayList();
  IntStream.range(0, filesCount / 2).forEach(i -> {
    records.add(record1);
    records.add(record2);
  });

  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount);
  try {
    df.writeTo(tableName).append();
  } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) {
    throw new RuntimeException(e);
  }
}
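insertData is a setup helper for the rewrite_data_files procedure tests: it writes filesCount small files so the procedure has something to compact. A hypothetical call site, assuming the test class exposes a catalogName and tableIdent and a sql helper like the one sketched earlier; only the CALL syntax itself comes from Iceberg's documented rewrite_data_files procedure:

// Hypothetical usage; catalogName and tableIdent are assumed fields of the
// procedure test base class and are not shown in this excerpt.
insertData(10);
List<Object[]> output = sql(
    String.format("CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent));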