Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testWapFilesAreKept.
@Test
public void testWapFilesAreKept() throws InterruptedException {
  Map<String, String> props = Maps.newHashMap();
  props.put(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED, "true");
  Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);

  // normal write
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  spark.conf().set("spark.wap.id", "1");

  // wap write
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Should not return data from the staged snapshot", records, actualRecords);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations()));
}
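Every example in this section builds its input rows from ThreeColumnRecord and reads them back through Encoders.bean, so the class has to be a plain Java bean. Below is a minimal sketch of what that bean is assumed to look like; the field names come from the selected columns c1, c2, c3 and the types are inferred from the literals above, not copied from the real source:

// Assumed shape of ThreeColumnRecord: a simple bean with a no-arg constructor,
// getters/setters for Spark's bean encoder, and value-based equals/hashCode so the
// list assertions above compare record contents rather than object identities.
public class ThreeColumnRecord {
  private Integer c1;
  private String c2;
  private String c3;

  public ThreeColumnRecord() {
  }

  public ThreeColumnRecord(Integer c1, String c2, String c3) {
    this.c1 = c1;
    this.c2 = c2;
    this.c3 = c3;
  }

  public Integer getC1() { return c1; }
  public void setC1(Integer c1) { this.c1 = c1; }
  public String getC2() { return c2; }
  public void setC2(String c2) { this.c2 = c2; }
  public String getC3() { return c3; }
  public void setC3(String c3) { this.c3 = c3; }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    ThreeColumnRecord that = (ThreeColumnRecord) o;
    return java.util.Objects.equals(c1, that.c1)
        && java.util.Objects.equals(c2, that.c2)
        && java.util.Objects.equals(c3, that.c3);
  }

  @Override
  public int hashCode() {
    return java.util.Objects.hash(c1, c2, c3);
  }
}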
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testMetadataFolderIsIntact.
@Test
public void testMetadataFolderIsIntact() throws InterruptedException {
  // write data directly to the table location
  Map<String, String> props = Maps.newHashMap();
  props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation);
  Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // write an untracked Parquet file next to the table's data
  df.write().mode("append").parquet(tableLocation + "/c2_trunc=AA/c3=AAAA");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations()));

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
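The tests reference SCHEMA, SPEC and TABLES from the enclosing test class without showing them. The partition directories they write to (c2_trunc=AA/c3=AAAA) imply a truncate(c2, 2) plus identity(c3) spec over a three-column schema; the following is a sketch under that assumption, not the exact declarations from the test class:

// Assumed test fixtures; optional(...) is Types.NestedField.optional, statically imported.
private static final HadoopTables TABLES = new HadoopTables(new Configuration());

private static final Schema SCHEMA = new Schema(
    optional(1, "c1", Types.IntegerType.get()),
    optional(2, "c2", Types.StringType.get()),
    optional(3, "c3", Types.StringType.get()));

// "c2_trunc=AA" matches truncate("c2", 2); "c3=AAAA" matches identity("c3")
private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA)
    .truncate("c2", 2)
    .identity("c3")
    .build();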
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testOlderThanTimestamp.
@Test
public void testOlderThanTimestamp() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // two orphan files written before the cutoff timestamp
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  // take the cutoff between the first two orphan writes and the third
  Thread.sleep(1000);
  long timestamp = System.currentTimeMillis();
  Thread.sleep(1000);

  // this orphan file is newer than the cutoff and must survive
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(timestamp)
      .execute();

  Assert.assertEquals("Should delete only 2 files", 2, Iterables.size(result.orphanFileLocations()));
}
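The action in these examples scans the entire table location. When you only want to inspect a sub-directory, or list candidates without actually removing anything, DeleteOrphanFiles also exposes location(...) and deleteWith(...). A sketch of such a dry run follows; the path is only illustrative, and table and tableLocation are the same fixtures used above:

// Dry run: restrict the scan to one prefix and record candidates instead of deleting them.
List<String> candidates = Lists.newArrayList();
DeleteOrphanFiles.Result dryRun = SparkActions.get()
    .deleteOrphanFiles(table)
    .location(tableLocation + "/data")      // only consider files under this prefix
    .olderThan(System.currentTimeMillis())  // same age cutoff the tests use
    .deleteWith(candidates::add)            // collect paths, do not delete
    .execute();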
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method orphanedFileRemovedWithParallelTasks.
@Test
public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1);

  // original append
  df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1);

  // dynamic partition overwrite
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation);

  // second append
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // write files Iceberg does not track; these become orphan candidates
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  Set<String> deletedFiles = Sets.newHashSet();
  Set<String> deleteThreads = ConcurrentHashMap.newKeySet();
  AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
  ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> {
    Thread thread = new Thread(runnable);
    thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement());
    thread.setDaemon(true);
    return thread;
  });

  DeleteOrphanFiles.Result result = SparkActions.get()
      .deleteOrphanFiles(table)
      .executeDeleteWith(executorService)
      .olderThan(System.currentTimeMillis())
      .deleteWith(file -> {
        deleteThreads.add(Thread.currentThread().getName());
        deletedFiles.add(file);
      })
      .execute();

  // Verifies that the delete callbacks ran in the threads created by the provided ExecutorService's ThreadFactory
  Assert.assertEquals(deleteThreads, Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3"));
  Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size());
}
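One detail the test does not spell out: the ExecutorService passed to executeDeleteWith is only borrowed by the action, so (assuming the usual ownership convention) the caller remains responsible for shutting it down once the result has been consumed, e.g.:

// Assumed cleanup responsibility: the action does not shut down the provided executor.
executorService.shutdown();
executorService.awaitTermination(1, TimeUnit.MINUTES);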
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testRemoveUnreachableMetadataVersionFiles.
@Test
public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedException {
  Map<String, String> props = Maps.newHashMap();
  props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation);
  props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1");
  Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);

  // two appends so that the oldest metadata version falls out of the retained history
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .execute();

  Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations()));
  Assert.assertTrue("Should remove v1 file",
      StreamSupport.stream(result.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("v1.metadata.json")));

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records);
  expectedRecords.addAll(records);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
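A related configuration note, as an aside rather than part of the test: instead of leaving superseded metadata versions behind for deleteOrphanFiles to find, Iceberg can drop them at commit time when metadata cleanup is enabled alongside the previous-versions-max property. A sketch of creating the table that way, using the same fixtures as above:

// Sketch: have old metadata.json versions deleted at commit time rather than
// left behind as orphan candidates.
Map<String, String> props = Maps.newHashMap();
props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1");
props.put(TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED, "true");
Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation);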