Example 6 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testWapFilesAreKept.

@Test
public void testWapFilesAreKept() throws InterruptedException {
    Map<String, String> props = Maps.newHashMap();
    props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true");
    Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    // normal write
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    spark.conf().set("spark.wap.id", "1");
    // wap write
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords);
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
    Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations()));
}
Also used : Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
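
This test exercises Iceberg's write-audit-publish (WAP) flow: with write.wap.enabled on the table and spark.wap.id set in the session config, the second append is staged as an unpublished snapshot, so the read returns only the first append, yet the staged files are still reachable and must not be removed as orphans. A staged snapshot can later be published through the snapshot-management API; below is a minimal sketch, where stagedSnapshotId is a hypothetical variable holding the staged snapshot's ID (for example, the snapshot whose summary records wap.id=1):

// Hedged sketch, not part of the test above: publish a staged WAP snapshot
// by cherry-picking it onto the current table state.
// `stagedSnapshotId` is a hypothetical variable.
table.manageSnapshots()
    .cherrypick(stagedSnapshotId)
    .commit();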

Example 7 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testMetadataFolderIsIntact.

@Test
public void testMetadataFolderIsIntact() throws InterruptedException {
    // write data directly to the table location
    Map<String, String> props = Maps.newHashMap();
    props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation);
    Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA");
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
    Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations()));
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
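
Here the stray parquet write lands directly under the table location (write.data.path points at the table root), so the action reports exactly one orphan while leaving the metadata folder intact. When a table location also holds unrelated files, the scan can be narrowed with the action's location setting; a minimal sketch, assuming a layout where data files live under a data/ subfolder (not the layout of this particular test):

// Hedged sketch: restrict the orphan scan to one subtree instead of the
// whole table location; the "/data" suffix is an assumed layout.
DeleteOrphanFiles.Result scoped = SparkActions.get()
    .deleteOrphanFiles(table)
    .location(tableLocation + "/data")
    .olderThan(System.currentTimeMillis())
    .execute();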

Example 8 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testOlderThanTimestamp.

@Test
public void testOlderThanTimestamp() throws InterruptedException {
    Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
    df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
    Thread.sleep(1000);
    long timestamp = System.currentTimeMillis();
    Thread.sleep(1000);
    df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(timestamp).execute();
    Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations()));
}
Also used : Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
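
The olderThan bound is the lever under test: the two parquet files written before the captured timestamp are deleted, and the one written after it survives (without an explicit olderThan, the action defaults to a cutoff three days in the past). A dry run can be approximated by swapping in a delete function that only records paths; a minimal sketch, where wouldDelete is a hypothetical local list:

// Hedged sketch: list orphans older than `timestamp` without removing them,
// by replacing the delete function with a collector.
List<String> wouldDelete = Lists.newArrayList();
SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(timestamp)
    .deleteWith(wouldDelete::add)
    .execute();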

Example 9 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method orphanedFileRemovedWithParallelTasks.

@Test
public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException {
    Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1);
    // original append
    df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1);
    // dynamic partition overwrite
    df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation);
    // second append
    df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data");
    df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA");
    df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
    df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid");
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    Set<String> deletedFiles = Sets.newHashSet();
    Set<String> deleteThreads = ConcurrentHashMap.newKeySet();
    AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
    ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> {
        Thread thread = new Thread(runnable);
        thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement());
        thread.setDaemon(true);
        return thread;
    });
    DeleteOrphanFiles.Result result = SparkActions.get()
        .deleteOrphanFiles(table)
        .executeDeleteWith(executorService)
        .olderThan(System.currentTimeMillis())
        .deleteWith(file -> {
            deleteThreads.add(Thread.currentThread().getName());
            deletedFiles.add(file);
        })
        .execute();
    // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory
    Assert.assertEquals(deleteThreads, Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3"));
    Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size());
}
Also used : Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) FileSystem(org.apache.hadoop.fs.FileSystem) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) FileStatus(org.apache.hadoop.fs.FileStatus) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) StreamSupport(java.util.stream.StreamSupport) Namespace(org.apache.iceberg.catalog.Namespace) ExecutorService(java.util.concurrent.ExecutorService) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) AssertHelpers(org.apache.iceberg.AssertHelpers) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) HadoopCatalog(org.apache.iceberg.hadoop.HadoopCatalog) Table(org.apache.iceberg.Table) HiddenPathFilter(org.apache.iceberg.hadoop.HiddenPathFilter) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Set(java.util.Set) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) File(java.io.File) Executors(java.util.concurrent.Executors) Encoders(org.apache.spark.sql.Encoders) ValidationException(org.apache.iceberg.exceptions.ValidationException) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot) Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ExecutorService(java.util.concurrent.ExecutorService) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
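
executeDeleteWith hands the per-file deletes to the supplied ExecutorService, and the custom deleteWith function records which pool thread ran each delete, which is exactly what the thread-name assertion verifies. Outside a test, a caller-owned pool presumably needs to be shut down once the action completes; a minimal sketch under that assumption:

// Hedged sketch: run parallel deletes with a caller-owned pool, then release
// it (assuming the action itself does not shut the pool down).
ExecutorService pool = Executors.newFixedThreadPool(4);
try {
    SparkActions.get()
        .deleteOrphanFiles(table)
        .executeDeleteWith(pool)
        .olderThan(System.currentTimeMillis())
        .execute();
} finally {
    pool.shutdown();
}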

Example 10 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testRemoveUnreachableMetadataVersionFiles.

@Test
public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedException {
    Map<String, String> props = Maps.newHashMap();
    props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation);
    props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1");
    Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
    Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations()));
    Assert.assertTrue("Should remove v1 file", StreamSupport.stream(result.orphanFileLocations().spliterator(), false).anyMatch(file -> file.contains("v1.metadata.json")));
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records);
    expectedRecords.addAll(records);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) FileSystem(org.apache.hadoop.fs.FileSystem) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) FileStatus(org.apache.hadoop.fs.FileStatus) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) StreamSupport(java.util.stream.StreamSupport) Namespace(org.apache.iceberg.catalog.Namespace) ExecutorService(java.util.concurrent.ExecutorService) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) AssertHelpers(org.apache.iceberg.AssertHelpers) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) HadoopCatalog(org.apache.iceberg.hadoop.HadoopCatalog) Table(org.apache.iceberg.Table) HiddenPathFilter(org.apache.iceberg.hadoop.HiddenPathFilter) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Set(java.util.Set) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) File(java.io.File) Executors(java.util.concurrent.Executors) Encoders(org.apache.spark.sql.Encoders) ValidationException(org.apache.iceberg.exceptions.ValidationException) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot) Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
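
With write.metadata.previous-versions-max set to 1, the second commit pushes v1.metadata.json out of the retained history, leaving it unreachable and therefore an orphan. Iceberg can instead delete superseded metadata files at commit time so they never need this action; a minimal sketch of that alternative configuration (an assumption about usage, not part of the test):

// Hedged sketch: make commits delete metadata versions that fall out of the
// retained window, so they never become orphans.
table.updateProperties()
    .set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1")
    .set(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "true")
    .commit();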

Aggregations

ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord): 34
Row (org.apache.spark.sql.Row): 33
Test (org.junit.Test): 32
Table (org.apache.iceberg.Table): 26
PartitionSpec (org.apache.iceberg.PartitionSpec): 18
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 12
File (java.io.File): 11
Configuration (org.apache.hadoop.conf.Configuration): 11
List (java.util.List): 10
Map (java.util.Map): 10
Schema (org.apache.iceberg.Schema): 10
Snapshot (org.apache.iceberg.Snapshot): 10
TableProperties (org.apache.iceberg.TableProperties): 10
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 10
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 10
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 10
SparkTestBase (org.apache.iceberg.spark.SparkTestBase): 10
Types (org.apache.iceberg.types.Types): 10
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 10
Dataset (org.apache.spark.sql.Dataset): 10