Search in sources :

Example 1 with Result

use of org.apache.iceberg.actions.RewriteDataFiles.Result in project iceberg by apache.

The class TestRewriteDataFilesAction defines the method testParallelPartialProgressWithCommitFailure.

@Test
public void testParallelPartialProgressWithCommitFailure() {
    // Verifies that with partial progress enabled, a failure in one of several
    // commits does not discard the file groups committed by the other commits,
    // and the table data remains unchanged afterwards.
    Table table = createTable(20);
    int fileSize = averageFileSize(table);
    List<Object[]> originalData = currentData();
    // Cap each file group at roughly two files so the 20 input files form 10 groups,
    // rewritten 3 at a time and spread over at most 3 partial-progress commits.
    BaseRewriteDataFilesSparkAction realRewrite = (org.apache.iceberg.spark.actions.BaseRewriteDataFilesSparkAction) basicRewrite(table).option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)).option(RewriteDataFiles.MAX_CONCURRENT_FILE_GROUP_REWRITES, "3").option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true").option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "3");
    BaseRewriteDataFilesSparkAction spyRewrite = spy(realRewrite);
    RewriteDataFilesCommitManager util = spy(new RewriteDataFilesCommitManager(table));
    // First and Third commits work, second does not
    doCallRealMethod().doThrow(new RuntimeException("Commit Failed")).doCallRealMethod().when(util).commitFileGroups(any());
    // Inject the stubbed commit manager into the action under test.
    doReturn(util).when(spyRewrite).commitManager(table.currentSnapshot().snapshotId());
    RewriteDataFiles.Result result = spyRewrite.execute();
    // Commit 1: 4/4 + Commit 2 failed 0/4 + Commit 3: 2/2 == 6 out of 10 total groups committed
    Assert.assertEquals("Should have 6 fileGroups", 6, result.rewriteResults().size());
    table.refresh();
    List<Object[]> postRewriteData = currentData();
    assertEquals("We shouldn't have changed the data", originalData, postRewriteData);
    // Only 2 new commits because we broke one
    shouldHaveSnapshots(table, 3);
    shouldHaveNoOrphans(table);
    shouldHaveACleanCache(table);
}
Also used : RewriteDataFilesCommitManager(org.apache.iceberg.actions.RewriteDataFilesCommitManager) Table(org.apache.iceberg.Table) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) RewriteDataFiles(org.apache.iceberg.actions.RewriteDataFiles) Test(org.junit.Test)

Example 2 with Result

use of org.apache.iceberg.actions.RewriteDataFiles.Result in project iceberg by apache.

The class TestRewriteDataFilesAction defines the method testBinPackWithStartingSequenceNumberV1Compatibility.

@Test
public void testBinPackWithStartingSequenceNumberV1Compatibility() {
    // Verifies that USE_STARTING_SEQUENCE_NUMBER keeps the sequence number at 0
    // on a V1 table (V1 tables always report sequence number 0), both for the
    // table snapshot and for every manifest entry.
    Table table = createTablePartitioned(4, 2);
    shouldHaveFiles(table, 8);
    List<Object[]> expectedRecords = currentData();
    table.refresh();
    long oldSequenceNumber = table.currentSnapshot().sequenceNumber();
    Assert.assertEquals("Table sequence number should be 0", 0, oldSequenceNumber);
    Result result = basicRewrite(table).option(RewriteDataFiles.USE_STARTING_SEQUENCE_NUMBER, "true").execute();
    Assert.assertEquals("Action should rewrite 8 data files", 8, result.rewrittenDataFilesCount());
    // fixed message grammar: "4 data file" -> "4 data files"
    Assert.assertEquals("Action should add 4 data files", 4, result.addedDataFilesCount());
    shouldHaveFiles(table, 4);
    List<Object[]> actualRecords = currentData();
    assertEquals("Rows must match", expectedRecords, actualRecords);
    table.refresh();
    Assert.assertEquals("Table sequence number should still be 0", oldSequenceNumber, table.currentSnapshot().sequenceNumber());
    // Check the sequence number recorded for each manifest entry as well.
    Dataset<Row> rows = SparkTableUtil.loadMetadataTable(spark, table, MetadataTableType.ENTRIES);
    for (Row row : rows.collectAsList()) {
        Assert.assertEquals("Expect sequence number 0 for all entries", oldSequenceNumber, row.getLong(2));
    }
}
Also used : Table(org.apache.iceberg.Table) Row(org.apache.spark.sql.Row) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) Test(org.junit.Test)

Example 3 with Result

use of org.apache.iceberg.actions.RewriteDataFiles.Result in project iceberg by apache.

The class TestRewriteDataFilesAction defines the method testSimpleSort.

@Test
public void testSimpleSort() {
    // Verifies that a sort rewrite reorders data by the table's sort order (c2)
    // without changing the data itself, producing a single rewritten file group.
    Table table = createTable(20);
    shouldHaveFiles(table, 20);
    table.replaceSortOrder().asc("c2").commit();
    shouldHaveLastCommitUnsorted(table, "c2");
    List<Object[]> originalData = currentData();
    // REWRITE_ALL + MIN_INPUT_FILES=1 forces every file into the rewrite.
    RewriteDataFiles.Result result = basicRewrite(table).sort().option(SortStrategy.MIN_INPUT_FILES, "1").option(SortStrategy.REWRITE_ALL, "true").option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))).execute();
    // fixed: expected value must be the second argument of Assert.assertEquals
    Assert.assertEquals("Should have 1 fileGroups", 1, result.rewriteResults().size());
    table.refresh();
    List<Object[]> postRewriteData = currentData();
    assertEquals("We shouldn't have changed the data", originalData, postRewriteData);
    shouldHaveSnapshots(table, 2);
    shouldHaveACleanCache(table);
    shouldHaveMultipleFiles(table);
    shouldHaveLastCommitSorted(table, "c2");
}
Also used : Table(org.apache.iceberg.Table) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) RewriteDataFiles(org.apache.iceberg.actions.RewriteDataFiles) Test(org.junit.Test)

Example 4 with Result

use of org.apache.iceberg.actions.RewriteDataFiles.Result in project iceberg by apache.

The class TestRewriteDataFilesAction defines the method testBinPackCombineMixedFiles.

@Test
public void testBinPackCombineMixedFiles() {
    // Start with a single data file, then mix in one small and one large file;
    // bin-packing should split the large file and merge the small ones, ending
    // with the same row set spread across three files.
    Table table = createTable(1);
    shouldHaveFiles(table, 1);
    // One small file (SCALE rows) and one large file (3 * SCALE rows).
    writeRecords(1, SCALE);
    writeRecords(1, SCALE * 3);
    shouldHaveFiles(table, 3);
    List<Object[]> rowsBeforeRewrite = currentData();
    int avgSize = averageFileSize(table);
    // Target just above the average size; min/max bracket it so the large file
    // is split and the small files qualify for combining.
    RewriteDataFiles action = basicRewrite(table)
            .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(avgSize + 1000))
            .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Integer.toString(avgSize + 80000))
            .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, Integer.toString(avgSize - 1000));
    Result result = action.execute();
    Assert.assertEquals("Action should delete 3 data files", 3, result.rewrittenDataFilesCount());
    // Should Split the big files into 3 pieces, one of which should be combined with the two smaller files
    Assert.assertEquals("Action should add 3 data files", 3, result.addedDataFilesCount());
    shouldHaveFiles(table, 3);
    List<Object[]> rowsAfterRewrite = currentData();
    assertEquals("Rows must match", rowsBeforeRewrite, rowsAfterRewrite);
}
Also used : Table(org.apache.iceberg.Table) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) Test(org.junit.Test)

Example 5 with Result

use of org.apache.iceberg.actions.RewriteDataFiles.Result in project iceberg by apache.

The class TestRewriteDataFilesAction defines the method testPartialProgressEnabled.

@Test
public void testPartialProgressEnabled() {
    // Verifies that partial progress splits the rewrite into many small file
    // groups committed incrementally (one snapshot per commit) without
    // changing the table's data.
    Table table = createTable(20);
    int fileSize = averageFileSize(table);
    List<Object[]> originalData = currentData();
    // Perform a rewrite but only allow 2 files to be compacted at a time
    RewriteDataFiles.Result result = basicRewrite(table).option(RewriteDataFiles.PARTIAL_PROGRESS_ENABLED, "true").option(RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)).option(RewriteDataFiles.PARTIAL_PROGRESS_MAX_COMMITS, "10").execute();
    // fixed: expected value must be the second argument of Assert.assertEquals
    Assert.assertEquals("Should have 10 fileGroups", 10, result.rewriteResults().size());
    table.refresh();
    // 1 original snapshot + 10 partial-progress commits.
    shouldHaveSnapshots(table, 11);
    shouldHaveACleanCache(table);
    List<Object[]> postRewriteData = currentData();
    assertEquals("We shouldn't have changed the data", originalData, postRewriteData);
}
Also used : Table(org.apache.iceberg.Table) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) RewriteDataFiles(org.apache.iceberg.actions.RewriteDataFiles) Test(org.junit.Test)

Aggregations

Table (org.apache.iceberg.Table)24 Result (org.apache.iceberg.actions.RewriteDataFiles.Result)24 Test (org.junit.Test)24 RewriteDataFiles (org.apache.iceberg.actions.RewriteDataFiles)13 FileScanTask (org.apache.iceberg.FileScanTask)3 Row (org.apache.spark.sql.Row)3 ContentFile (org.apache.iceberg.ContentFile)2 DataFile (org.apache.iceberg.DataFile)2 RowDelta (org.apache.iceberg.RowDelta)2 PartitionSpec (org.apache.iceberg.PartitionSpec)1 RewriteDataFilesCommitManager (org.apache.iceberg.actions.RewriteDataFilesCommitManager)1 ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord)1