Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache:
the class RemoveOrphanFilesProcedure, method call.
@Override
public InternalRow[] call(InternalRow args) {
  Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
  Long olderThanMillis = args.isNullAt(1) ? null : DateTimeUtil.microsToMillis(args.getLong(1));
  String location = args.isNullAt(2) ? null : args.getString(2);
  boolean dryRun = args.isNullAt(3) ? false : args.getBoolean(3);
  Integer maxConcurrentDeletes = args.isNullAt(4) ? null : args.getInt(4);

  Preconditions.checkArgument(maxConcurrentDeletes == null || maxConcurrentDeletes > 0,
      "max_concurrent_deletes should have value > 0, value: %s", maxConcurrentDeletes);

  return withIcebergTable(tableIdent, table -> {
    DeleteOrphanFiles action = actions().deleteOrphanFiles(table);

    if (olderThanMillis != null) {
      // the interval sanity check is skipped when a test sets spark.testing
      boolean isTesting = Boolean.parseBoolean(spark().conf().get("spark.testing", "false"));
      if (!isTesting) {
        validateInterval(olderThanMillis);
      }
      action.olderThan(olderThanMillis);
    }

    if (location != null) {
      action.location(location);
    }

    if (dryRun) {
      // no-op delete callback: orphan files are reported but not removed
      action.deleteWith(file -> { });
    }

    if (maxConcurrentDeletes != null && maxConcurrentDeletes > 0) {
      action.executeDeleteWith(executorService(maxConcurrentDeletes, "remove-orphans"));
    }

    DeleteOrphanFiles.Result result = action.execute();
    return toOutputRows(result);
  });
}
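This procedure backs the Spark SQL CALL syntax for remove_orphan_files. A minimal invocation sketch from Java, where the catalog name my_catalog and the table db.sample are placeholders:

// Dry-run invocation via Spark SQL; "my_catalog" and "db.sample" are placeholders.
spark.sql("CALL my_catalog.system.remove_orphan_files(table => 'db.sample', dry_run => true)");

With dry_run => true, the no-op deleteWith callback above makes the procedure report orphan file locations without deleting anything.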
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache:
the class TestDeleteReachableFilesAction, method testIgnoreMetadataFilesNotFound.
@Test
public void testIgnoreMetadataFilesNotFound() {
  table.updateProperties().set(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1").commit();
  table.newAppend().appendFile(FILE_A).commit();
  // There are three metadata json files at this point
  DeleteOrphanFiles.Result result =
      sparkActions().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();

  Assert.assertEquals("Should delete 1 file", 1, Iterables.size(result.orphanFileLocations()));
  Assert.assertTrue("Should remove v1 file",
      StreamSupport.stream(result.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("v1.metadata.json")));

  DeleteReachableFiles baseRemoveFilesSparkAction =
      sparkActions().deleteReachableFiles(metadataLocation(table)).io(table.io());
  DeleteReachableFiles.Result res = baseRemoveFilesSparkAction.execute();
  checkRemoveFilesResults(1, 1, 1, 4, res);
}
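The checkRemoveFilesResults helper is defined elsewhere in the test class and not shown here. A sketch of what it plausibly asserts, with parameter names and order inferred from the call site (both are assumptions, not the actual helper):

// Hypothetical reconstruction of the assertion helper; the parameter order
// (data files, manifests, manifest lists, other files) is an assumption.
private void checkRemoveFilesResults(long dataFiles, long manifests, long manifestLists,
    long otherFiles, DeleteReachableFiles.Result results) {
  Assert.assertEquals("data files", dataFiles, results.deletedDataFilesCount());
  Assert.assertEquals("manifests", manifests, results.deletedManifestsCount());
  Assert.assertEquals("manifest lists", manifestLists, results.deletedManifestListsCount());
  Assert.assertEquals("other files", otherFiles, results.deletedOtherFilesCount());
}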
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache:
the class TestRemoveOrphanFilesAction, method orphanedFileRemovedWithParallelTasks.
@Test
public void orphanedFileRemovedWithParallelTasks() throws InterruptedException, IOException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1);
  // original append
  df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1);
  // dynamic partition overwrite
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation);
  // second append
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // write four parquet files outside of Iceberg's control; these become the orphans
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid");

  // sleep for 1 second to ensure the files are old enough to be collected
  Thread.sleep(1000);

  // both sets are mutated from the delete pool's threads, so use concurrent sets
  Set<String> deletedFiles = ConcurrentHashMap.newKeySet();
  Set<String> deleteThreads = ConcurrentHashMap.newKeySet();
  AtomicInteger deleteThreadsIndex = new AtomicInteger(0);
  ExecutorService executorService = Executors.newFixedThreadPool(4, runnable -> {
    Thread thread = new Thread(runnable);
    thread.setName("remove-orphan-" + deleteThreadsIndex.getAndIncrement());
    thread.setDaemon(true);
    return thread;
  });

  DeleteOrphanFiles.Result result = SparkActions.get().deleteOrphanFiles(table)
      .executeDeleteWith(executorService)
      .olderThan(System.currentTimeMillis())
      .deleteWith(file -> {
        deleteThreads.add(Thread.currentThread().getName());
        deletedFiles.add(file);
      })
      .execute();

  // Verifies that the delete methods ran in the threads created by the provided ExecutorService ThreadFactory
  Assert.assertEquals(deleteThreads,
      Sets.newHashSet("remove-orphan-0", "remove-orphan-1", "remove-orphan-2", "remove-orphan-3"));
  Assert.assertEquals("Should delete 4 files", 4, deletedFiles.size());
}
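The procedure snippet earlier calls an executorService(int, String) helper that is not shown in this section. A minimal sketch of such a factory, assuming plain JDK executors and mirroring the inline ThreadFactory in the test above (this is not Iceberg's actual helper):

// Sketch of a named daemon-thread pool factory; not Iceberg's actual implementation.
private static ExecutorService executorService(int poolSize, String namePrefix) {
  AtomicInteger threadIndex = new AtomicInteger(0);
  return Executors.newFixedThreadPool(poolSize, runnable -> {
    Thread thread = new Thread(runnable);
    thread.setName(namePrefix + "-" + threadIndex.getAndIncrement());
    thread.setDaemon(true);
    return thread;
  });
}

Daemon threads keep a leaked pool from blocking JVM shutdown, but callers should still shut the pool down once the action completes.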
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache:
the class TestRemoveOrphanFilesAction3, method testSparkCatalogNamedHiveTable.
@Test
public void testSparkCatalogNamedHiveTable() throws Exception {
  // a Hadoop-type catalog deliberately registered under the name "hive"
  spark.conf().set("spark.sql.catalog.hive", "org.apache.iceberg.spark.SparkCatalog");
  spark.conf().set("spark.sql.catalog.hive.type", "hadoop");
  spark.conf().set("spark.sql.catalog.hive.warehouse", tableLocation);
  SparkCatalog cat = (SparkCatalog) spark.sessionState().catalogManager().catalog("hive");

  String[] database = { "default" };
  Identifier id = Identifier.of(database, "table");
  Map<String, String> options = Maps.newHashMap();
  Transform[] transforms = {};
  cat.createTable(id, SparkSchemaUtil.convert(SCHEMA), transforms, options);
  SparkTable table = cat.loadTable(id);

  spark.sql("INSERT INTO hive.default.table VALUES (1,1,1)");

  String location = table.table().location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table())
      .olderThan(System.currentTimeMillis() + 1000)
      .execute();
  Assert.assertTrue("trash file should be removed",
      StreamSupport.stream(results.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "/data/trashfile")));
}
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache:
the class TestRemoveOrphanFilesAction3, method testSparkSessionCatalogHiveTable.
@Test
public void testSparkSessionCatalogHiveTable() throws Exception {
  spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog");
  spark.conf().set("spark.sql.catalog.spark_catalog.type", "hive");
  SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog();

  String[] database = { "default" };
  Identifier id = Identifier.of(database, "sessioncattest");
  Map<String, String> options = Maps.newHashMap();
  Transform[] transforms = {};
  // drop any leftover table from a previous run before recreating it
  cat.dropTable(id);
  cat.createTable(id, SparkSchemaUtil.convert(SCHEMA), transforms, options);
  SparkTable table = (SparkTable) cat.loadTable(id);

  spark.sql("INSERT INTO default.sessioncattest VALUES (1,1,1)");

  String location = table.table().location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result results = SparkActions.get().deleteOrphanFiles(table.table())
      .olderThan(System.currentTimeMillis() + 1000)
      .execute();
  Assert.assertTrue("trash file should be removed",
      StreamSupport.stream(results.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "/data/trashfile")));
}
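Outside of tests, the same action runs against any loaded Table. A minimal standalone sketch, assuming a Hadoop table at a placeholder warehouse path and a three-day retention window:

// Standalone usage sketch; the warehouse path is a placeholder.
Table table = new HadoopTables(spark.sessionState().newHadoopConf())
    .load("/path/to/warehouse/db/tbl");
DeleteOrphanFiles.Result result = SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3))
    .execute();
result.orphanFileLocations().forEach(path -> System.out.println("Removed: " + path));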