Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testRemoveOrphanFilesWithHadoopCatalog:
@Test
public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException {
  HadoopCatalog catalog = new HadoopCatalog(new Configuration(), tableLocation);
  String namespaceName = "testDb";
  String tableName = "testTb";
  Namespace namespace = Namespace.of(namespaceName);
  TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName);
  Table table = catalog.createTable(
      tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap());

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);

  // write one data file through Iceberg, then one stray Parquet file beside it
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location());
  df.write().mode("append").parquet(table.location() + "/data");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);
  table.refresh();

  DeleteOrphanFiles.Result result =
      SparkActions.get()
          .deleteOrphanFiles(table)
          .olderThan(System.currentTimeMillis())
          .execute();
  Assert.assertEquals("Should delete only 1 file", 1, Iterables.size(result.orphanFileLocations()));

  Dataset<Row> resultDF = spark.read().format("iceberg").load(table.location());
  List<ThreeColumnRecord> actualRecords =
      resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
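The test above deletes orphans as it finds them. To preview what would be removed without touching storage, the same builder accepts a custom delete callback through deleteWith(Consumer<String>). A minimal sketch against the table from the test above; the candidates list is illustrative, not part of the original test:

// Sketch: dry run by recording orphan locations instead of deleting them.
List<String> candidates = Lists.newArrayList();
SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis())
    .deleteWith(candidates::add) // invoked once per orphan file location
    .execute();
// candidates now lists every file the action would have deleted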
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache.
From the class TestRemoveOrphanFilesAction, method testHiveCatalogTable:
@Test
public void testHiveCatalogTable() throws IOException {
  Table table = catalog.createTable(
      TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save("default.hivetestorphan");

  // plant an orphan file directly in the table's data directory
  String location = table.location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result result =
      SparkActions.get()
          .deleteOrphanFiles(table)
          .olderThan(System.currentTimeMillis() + 1000)
          .execute();
  Assert.assertTrue(
      "trash file should be removed",
      StreamSupport.stream(result.orphanFileLocations().spliterator(), false)
          // no '/' before "data": location keeps the trailing slash of the directory URI
          .anyMatch(file -> file.contains("file:" + location + "data/trashfile")));
}
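By default the action scans the entire table location. When only one prefix needs checking, the scan can be narrowed with location(). A sketch reusing the table handle from the test above; the path argument is illustrative:

// Sketch: only look for orphans under the table's data prefix.
DeleteOrphanFiles.Result scoped =
    SparkActions.get()
        .deleteOrphanFiles(table)
        .location(table.location() + "data") // table.location() ends with '/' here
        .olderThan(System.currentTimeMillis() + 1000)
        .execute();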
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache.
From the class TestRemoveOrphanFilesAction3, method testSparkCatalogNamedHadoopTable:
@Test
public void testSparkCatalogNamedHadoopTable() throws Exception {
  spark.conf().set("spark.sql.catalog.hadoop", "org.apache.iceberg.spark.SparkCatalog");
  spark.conf().set("spark.sql.catalog.hadoop.type", "hadoop");
  spark.conf().set("spark.sql.catalog.hadoop.warehouse", tableLocation);
  SparkCatalog cat = (SparkCatalog) spark.sessionState().catalogManager().catalog("hadoop");

  String[] database = {"default"};
  Identifier id = Identifier.of(database, "table");
  Map<String, String> options = Maps.newHashMap();
  Transform[] transforms = {};
  cat.createTable(id, SparkSchemaUtil.convert(SCHEMA), transforms, options);
  SparkTable table = cat.loadTable(id);

  spark.sql("INSERT INTO hadoop.default.table VALUES (1,1,1)");

  String location = table.table().location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result results =
      SparkActions.get()
          .deleteOrphanFiles(table.table())
          .olderThan(System.currentTimeMillis() + 1000)
          .execute();
  Assert.assertTrue(
      "trash file should be removed",
      StreamSupport.stream(results.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "/data/trashfile")));
}
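The underlying Iceberg Table is reached here via SparkTable.table(). An alternative, assuming the Spark 3 runtime's Spark3Util helper is on the classpath, is to resolve the table by its multipart name:

// Sketch: load the Iceberg table by name through the configured catalogs.
Table icebergTable = Spark3Util.loadIcebergTable(spark, "hadoop.default.table");
DeleteOrphanFiles.Result res =
    SparkActions.get()
        .deleteOrphanFiles(icebergTable)
        .olderThan(System.currentTimeMillis() + 1000)
        .execute();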
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache.
From the class TestRemoveOrphanFilesAction3, method testSparkCatalogTable:
@Test
public void testSparkCatalogTable() throws Exception {
  spark.conf().set("spark.sql.catalog.mycat", "org.apache.iceberg.spark.SparkCatalog");
  spark.conf().set("spark.sql.catalog.mycat.type", "hadoop");
  spark.conf().set("spark.sql.catalog.mycat.warehouse", tableLocation);
  SparkCatalog cat = (SparkCatalog) spark.sessionState().catalogManager().catalog("mycat");

  String[] database = {"default"};
  Identifier id = Identifier.of(database, "table");
  Map<String, String> options = Maps.newHashMap();
  Transform[] transforms = {};
  cat.createTable(id, SparkSchemaUtil.convert(SCHEMA), transforms, options);
  SparkTable table = cat.loadTable(id);

  spark.sql("INSERT INTO mycat.default.table VALUES (1,1,1)");

  String location = table.table().location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result results =
      SparkActions.get()
          .deleteOrphanFiles(table.table())
          .olderThan(System.currentTimeMillis() + 1000)
          .execute();
  Assert.assertTrue(
      "trash file should be removed",
      StreamSupport.stream(results.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "/data/trashfile")));
}
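Orphan deletion runs single-threaded on the driver by default. Recent Iceberg releases also expose executeDeleteWith(ExecutorService) on the action; the sketch below assumes a version that has it and the usual java.util.concurrent imports:

// Sketch, assuming a DeleteOrphanFiles version with executeDeleteWith:
// remove orphans using a bounded thread pool.
ExecutorService pool = Executors.newFixedThreadPool(4);
try {
  SparkActions.get()
      .deleteOrphanFiles(table.table())
      .olderThan(System.currentTimeMillis() + 1000)
      .executeDeleteWith(pool)
      .execute();
} finally {
  pool.shutdown();
}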
Use of org.apache.iceberg.actions.DeleteOrphanFiles in project iceberg by apache.
From the class TestRemoveOrphanFilesAction3, method testSparkSessionCatalogHadoopTable:
@Test
public void testSparkSessionCatalogHadoopTable() throws Exception {
  // replace the built-in session catalog with Iceberg's SparkSessionCatalog wrapper
  spark.conf().set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog");
  spark.conf().set("spark.sql.catalog.spark_catalog.type", "hadoop");
  spark.conf().set("spark.sql.catalog.spark_catalog.warehouse", tableLocation);
  SparkSessionCatalog cat = (SparkSessionCatalog) spark.sessionState().catalogManager().v2SessionCatalog();

  String[] database = {"default"};
  Identifier id = Identifier.of(database, "table");
  Map<String, String> options = Maps.newHashMap();
  Transform[] transforms = {};
  cat.createTable(id, SparkSchemaUtil.convert(SCHEMA), transforms, options);
  SparkTable table = (SparkTable) cat.loadTable(id);

  spark.sql("INSERT INTO default.table VALUES (1,1,1)");

  String location = table.table().location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result results =
      SparkActions.get()
          .deleteOrphanFiles(table.table())
          .olderThan(System.currentTimeMillis() + 1000)
          .execute();
  Assert.assertTrue(
      "trash file should be removed",
      StreamSupport.stream(results.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "/data/trashfile")));
}
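The same cleanup is also exposed as a stored procedure when the Iceberg SQL extensions (IcebergSparkSessionExtensions) are enabled for the session. A sketch, with dry_run => true so the call only lists orphan locations instead of deleting them:

// Sketch: invoke remove_orphan_files through SQL instead of the action API.
spark.sql(
    "CALL spark_catalog.system.remove_orphan_files("
        + "table => 'default.table', dry_run => true)").show();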