Use of org.apache.iceberg.actions.RewriteDataFiles.Result in the Apache Iceberg project:
class TestRewriteDataFilesAction, method testParallelPartialProgressWithCommitFailure.
@Test
public void testParallelPartialProgressWithCommitFailure() {
  Table table = createTable(20);
  int averageSize = averageFileSize(table);
  List<Object[]> dataBefore = currentData();

  // Small group size forces many file groups; partial progress splits them across 3 commits.
  BaseRewriteDataFilesSparkAction rewrite =
      (BaseRewriteDataFilesSparkAction)
          basicRewrite(table)
              .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(averageSize * 2 + 1000))
              .option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3")
              .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")
              .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3");
  BaseRewriteDataFilesSparkAction spyRewrite = spy(rewrite);
  RewriteDataFilesCommitManager commitManager = spy(new RewriteDataFilesCommitManager(table));

  // First and Third commits work, second does not
  doCallRealMethod()
      .doThrow(new RuntimeException("Commit Failed"))
      .doCallRealMethod()
      .when(commitManager)
      .commitFileGroups(any());

  doReturn(commitManager).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId());

  RewriteDataFiles.Result result = spyRewrite.execute();

  // Commit 1: 4/4 + Commit 2 failed 0/4 + Commit 3: 2/2 == 6 out of 10 total groups committed
  Assert.assertEquals("Should have 6 fileGroups", 6, result.rewriteResults().size());

  table.refresh();

  List<Object[]> dataAfter = currentData();
  assertEquals("We shouldn't have changed the data", dataBefore, dataAfter);

  // Only 2 new commits because we broke one
  shouldHaveSnapshots(table, 3);
  shouldHaveNoOrphans(table);
  shouldHaveACleanCache(table);
}
Use of org.apache.iceberg.actions.RewriteDataFiles.Result in the Apache Iceberg project:
class TestRewriteDataFilesAction, method testBinPackWithStartingSequenceNumberV1Compatibility.
@Test
public void testBinPackWithStartingSequenceNumberV1Compatibility() {
  Table table = createTablePartitioned(4, 2);
  shouldHaveFiles(table, 8);
  List<Object[]> expectedRecords = currentData();

  table.refresh();
  long sequenceBefore = table.currentSnapshot().sequenceNumber();
  // V1 tables have no sequence numbers, so the starting sequence number is always 0.
  Assert.assertEquals("Table sequence number should be 0", 0, sequenceBefore);

  Result result =
      basicRewrite(table)
          .option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true")
          .execute();

  Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount());
  Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFilesCount());
  shouldHaveFiles(table, 4);

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);

  table.refresh();
  Assert.assertEquals(
      "Table sequence number should still be 0",
      sequenceBefore,
      table.currentSnapshot().sequenceNumber());

  // Every manifest entry must carry the starting sequence number as well.
  Dataset<Row> entries = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES);
  for (Row entry : entries.collectAsList()) {
    Assert.assertEquals("Expect sequence number 0 for all entries", sequenceBefore, entry.getLong(2));
  }
}
Use of org.apache.iceberg.actions.RewriteDataFiles.Result in the Apache Iceberg project:
class TestRewriteDataFilesAction, method testSimpleSort.
@Test
public void testSimpleSort() {
  Table table = createTable(20);
  shouldHaveFiles(table, 20);

  table.replaceSortOrder().asc("c2").commit();
  shouldHaveLastCommitUnsorted(table, "c2");

  List<Object[]> originalData = currentData();

  // Rewrite everything into a single sorted group sized to the average file size.
  RewriteDataFiles.Result result =
      basicRewrite(table)
          .sort()
          .option(SortStrategy.MIN_INPUT_FILES, "1")
          .option(SortStrategy.REWRITE_ALL, "true")
          .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table)))
          .execute();

  // Fixed argument order: JUnit 4's assertEquals is (message, expected, actual);
  // the expected value 1 was previously passed in the actual position.
  Assert.assertEquals("Should have 1 fileGroups", 1, result.rewriteResults().size());

  table.refresh();

  List<Object[]> postRewriteData = currentData();
  assertEquals("We shouldn't have changed the data", originalData, postRewriteData);

  shouldHaveSnapshots(table, 2);
  shouldHaveACleanCache(table);
  shouldHaveMultipleFiles(table);
  shouldHaveLastCommitSorted(table, "c2");
}
Use of org.apache.iceberg.actions.RewriteDataFiles.Result in the Apache Iceberg project:
class TestRewriteDataFilesAction, method testBinPackCombineMixedFiles.
@Test
public void testBinPackCombineMixedFiles() {
  // 400000
  Table table = createTable(1);
  shouldHaveFiles(table, 1);

  // Add one more small file, and one large file
  writeRecords(1, SCALE);
  writeRecords(1, SCALE * 3);
  shouldHaveFiles(table, 3);

  List<Object[]> expectedRecords = currentData();
  int avgSize = averageFileSize(table);

  Result result =
      basicRewrite(table)
          .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(avgSize + 1000))
          .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(avgSize + 80000))
          .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(avgSize - 1000))
          .execute();

  Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount());
  // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files
  Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount());
  shouldHaveFiles(table, 3);

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);
}
Use of org.apache.iceberg.actions.RewriteDataFiles.Result in the Apache Iceberg project:
class TestRewriteDataFilesAction, method testPartialProgressEnabled.
@Test
public void testPartialProgressEnabled() {
  Table table = createTable(20);
  int fileSize = averageFileSize(table);

  List<Object[]> originalData = currentData();

  // Perform a rewrite but only allow 2 files to be compacted at a time
  RewriteDataFiles.Result result =
      basicRewrite(table)
          .option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true")
          .option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000))
          .option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10")
          .execute();

  // Fixed argument order: JUnit 4's assertEquals is (message, expected, actual);
  // the expected value 10 was previously passed in the actual position.
  Assert.assertEquals("Should have 10 fileGroups", 10, result.rewriteResults().size());

  table.refresh();

  shouldHaveSnapshots(table, 11);
  shouldHaveACleanCache(table);

  List<Object[]> postRewriteData = currentData();
  assertEquals("We shouldn't have changed the data", originalData, postRewriteData);
}
Aggregations