Search in sources :

Example 1 with DeleteReachableFiles

use of org.apache.iceberg.actions.DeleteReachableFiles in project iceberg by apache.

the class TestDeleteReachableFilesAction method testIgnoreMetadataFilesNotFound.

/**
 * Removes an old metadata file through the orphan-files action first, then checks that the
 * delete-reachable-files action still completes and reports the expected counts.
 */
@Test
public void testIgnoreMetadataFilesNotFound() {
    table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit();
    table.newAppend().appendFile(FILE_A).commit();

    // There are three metadata json files at this point
    DeleteOrphanFiles.Result result = sparkActions()
        .deleteOrphanFiles(table)
        .olderThan(System.currentTimeMillis())
        .execute();

    int orphanCount = Iterables.size(result.orphanFileLocations());
    Assert.assertEquals("Should delete 1 file", 1, orphanCount);

    boolean removedV1 = StreamSupport.stream(result.orphanFileLocations().spliterator(), false)
        .anyMatch(file -> file.contains("v1.metadata.json"));
    Assert.assertTrue("Should remove v1 file", removedV1);

    // Run the reachable-files cleanup after the v1 metadata file was removed above
    DeleteReachableFiles action = sparkActions()
        .deleteReachableFiles(metadataLocation(table))
        .io(table.io());
    DeleteReachableFiles.Result res = action.execute();
    checkRemoveFilesResults(1, 1, 1, 4, res);
}
Also used : ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DataFiles(org.apache.iceberg.DataFiles) Configuration(org.apache.hadoop.conf.Configuration) StreamSupport(java.util.stream.StreamSupport) DataFile(org.apache.iceberg.DataFile) Before(org.junit.Before) AssertHelpers(org.apache.iceberg.AssertHelpers) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Set(java.util.Set) HasTableOperations(org.apache.iceberg.HasTableOperations) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) Schema(org.apache.iceberg.Schema) File(java.io.File) Executors(java.util.concurrent.Executors) ActionsProvider(org.apache.iceberg.actions.ActionsProvider) ValidationException(org.apache.iceberg.exceptions.ValidationException) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) TestHelpers(org.apache.iceberg.TestHelpers) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Test(org.junit.Test)

Example 2 with DeleteReachableFiles

use of org.apache.iceberg.actions.DeleteReachableFiles in project iceberg by apache.

the class TestDeleteReachableFilesAction method testRemoveFilesActionWithDefaultIO.

/**
 * Runs the delete-reachable-files action without calling {@code io(...)} and checks the
 * reported counts after two appends.
 */
@Test
public void testRemoveFilesActionWithDefaultIO() {
    table.newAppend().appendFile(FILE_A).commit();
    table.newAppend().appendFile(FILE_B).commit();

    // No io(...) call on the action here;
    // per the original note, IO then defaults to HadoopFileIO
    DeleteReachableFiles action = sparkActions().deleteReachableFiles(metadataLocation(table));
    DeleteReachableFiles.Result outcome = action.execute();

    checkRemoveFilesResults(2, 2, 2, 4, outcome);
}
Also used : DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Test(org.junit.Test)

Example 3 with DeleteReachableFiles

use of org.apache.iceberg.actions.DeleteReachableFiles in project iceberg by apache.

the class TestDeleteReachableFilesAction method dataFilesCleanupWithParallelTasks.

/**
 * Verifies that the delete-reachable-files action invokes the custom delete callback on the
 * threads of the supplied executor service, and that every data file ever added to the table
 * (FILE_A through FILE_D) is handed to that callback.
 */
@Test
public void dataFilesCleanupWithParallelTasks() {
    table.newFastAppend().appendFile(FILE_A).commit();
    table.newFastAppend().appendFile(FILE_B).commit();
    table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_B), ImmutableSet.of(FILE_D)).commit();
    table.newRewrite().rewriteFiles(ImmutableSet.of(FILE_A), ImmutableSet.of(FILE_C)).commit();
    // Concurrent sets because the delete callback is expected to run on several pool threads
    Set<String> deletedFiles = ConcurrentHashMap.newKeySet();
    Set<String> deleteThreads = ConcurrentHashMap.newKeySet();
    AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
    DeleteReachableFiles.Result result = sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io()).executeDeleteWith(Executors.newFixedThreadPool(4, runnable -> {
        Thread thread = new Thread(runnable);
        thread.setName("remove-files-" + deleteThreadsIndex.getAndIncrement());
        // daemon threads will be terminated abruptly when the JVM exits
        thread.setDaemon(true);
        return thread;
    })).deleteWith(s -> {
        // Record both which thread performed the delete and which path was deleted
        deleteThreads.add(Thread.currentThread().getName());
        deletedFiles.add(s);
    }).execute();
    // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory
    // (expected value first, as required by Assert.assertEquals, so failure messages read correctly)
    Assert.assertEquals(Sets.newHashSet("remove-files-0", "remove-files-1", "remove-files-2", "remove-files-3"), deleteThreads);
    // Check each file individually; the lambda must use its own parameter rather than FILE_A
    Lists.newArrayList(FILE_A, FILE_B, FILE_C, FILE_D).forEach(file -> Assert.assertTrue(file.path() + " should be deleted", deletedFiles.contains(file.path().toString())));
    checkRemoveFilesResults(4L, 6L, 4L, 6, result);
}
Also used : ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DataFiles(org.apache.iceberg.DataFiles) Configuration(org.apache.hadoop.conf.Configuration) StreamSupport(java.util.stream.StreamSupport) DataFile(org.apache.iceberg.DataFile) Before(org.junit.Before) AssertHelpers(org.apache.iceberg.AssertHelpers) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Set(java.util.Set) HasTableOperations(org.apache.iceberg.HasTableOperations) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) Schema(org.apache.iceberg.Schema) File(java.io.File) Executors(java.util.concurrent.Executors) ActionsProvider(org.apache.iceberg.actions.ActionsProvider) ValidationException(org.apache.iceberg.exceptions.ValidationException) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) TestHelpers(org.apache.iceberg.TestHelpers) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Test(org.junit.Test)

Example 4 with DeleteReachableFiles

use of org.apache.iceberg.actions.DeleteReachableFiles in project iceberg by apache.

the class TestDeleteReachableFilesAction method testEmptyIOThrowsException.

/**
 * Expects executing the delete-reachable-files action to fail with an
 * {@link IllegalArgumentException} when the action's IO has been set to {@code null}.
 */
@Test
public void testEmptyIOThrowsException() {
    DeleteReachableFiles actionWithoutIo = sparkActions()
        .deleteReachableFiles(metadataLocation(table))
        .io(null);

    // The failure is expected from execute(), not from building the action above
    AssertHelpers.assertThrows(
        "FileIO needs to be set to use RemoveFiles action",
        IllegalArgumentException.class,
        "File IO cannot be null",
        actionWithoutIo::execute);
}
Also used : DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Test(org.junit.Test)

Example 5 with DeleteReachableFiles

use of org.apache.iceberg.actions.DeleteReachableFiles in project iceberg by apache.

the class TestDeleteReachableFilesAction method testRemoveFilesActionWithReducedVersionsTable.

@Test
public void testRemoveFilesActionWithReducedVersionsTable() {
    table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "2").commit();
    table.newAppend().appendFile(FILE_A).commit();
    table.newAppend().appendFile(FILE_B).commit();
    table.newAppend().appendFile(FILE_B).commit();
    table.newAppend().appendFile(FILE_C).commit();
    table.newAppend().appendFile(FILE_D).commit();
    DeleteReachableFiles baseRemoveFilesSparkAction = sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io());
    DeleteReachableFiles.Result result = baseRemoveFilesSparkAction.execute();
    checkRemoveFilesResults(4, 5, 5, 8, result);
}
Also used : DeleteReachableFiles(org.apache.iceberg.actions.DeleteReachableFiles) Test(org.junit.Test)

Aggregations

DeleteReachableFiles (org.apache.iceberg.actions.DeleteReachableFiles)6 Test (org.junit.Test)6 File (java.io.File)2 Set (java.util.Set)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 Executors (java.util.concurrent.Executors)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 StreamSupport (java.util.stream.StreamSupport)2 Configuration (org.apache.hadoop.conf.Configuration)2 AssertHelpers (org.apache.iceberg.AssertHelpers)2 DataFile (org.apache.iceberg.DataFile)2 DataFiles (org.apache.iceberg.DataFiles)2 HasTableOperations (org.apache.iceberg.HasTableOperations)2 PartitionSpec (org.apache.iceberg.PartitionSpec)2 Schema (org.apache.iceberg.Schema)2 Table (org.apache.iceberg.Table)2 TableProperties (org.apache.iceberg.TableProperties)2 TestHelpers (org.apache.iceberg.TestHelpers)2 ActionsProvider (org.apache.iceberg.actions.ActionsProvider)2 DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles)2