Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache: class TestRemoveOrphanFilesAction, method testManyLeafPartitions.
@Test
public void testManyLeafPartitions() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 100; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i % 3), String.valueOf(i)));
  }

  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result =
      actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
  Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations()));

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords =
      resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
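The tests in this class rely on fixtures that are defined elsewhere in TestRemoveOrphanFilesAction and are not shown on this page (TABLES, SCHEMA, SPEC, tableLocation, spark). Below is a minimal, hypothetical sketch of what those constants could look like, inferred from how the tests use them and from the data/c2_trunc=AA/c3=AAAA directories written further down; the exact definitions in the Iceberg source may differ.

import static org.apache.iceberg.types.Types.NestedField.optional;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

// Hypothetical fixture definitions matching how the tests use them.
private static final HadoopTables TABLES = new HadoopTables(new Configuration());

private static final Schema SCHEMA = new Schema(
    optional(1, "c1", Types.IntegerType.get()),
    optional(2, "c2", Types.StringType.get()),
    optional(3, "c3", Types.StringType.get()));

// Partitioning by truncate(c2, 2) and identity(c3) would explain the
// data/c2_trunc=AA/c3=AAAA paths referenced in the later tests.
private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA)
    .truncate("c2", 2)
    .identity("c3")
    .build();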
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache: class TestRemoveOrphanFilesAction, method testAllValidFilesAreKept.
@Test
public void testAllValidFilesAreKept() throws IOException, InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df1 = spark.createDataFrame(records1, ThreeColumnRecord.class).coalesce(1);

  // original append
  df1.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df2 = spark.createDataFrame(records2, ThreeColumnRecord.class).coalesce(1);

  // dynamic partition overwrite
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("overwrite").save(tableLocation);

  // second append
  df2.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  List<Snapshot> snapshots = Lists.newArrayList(table.snapshots());

  List<String> snapshotFiles1 = snapshotFiles(snapshots.get(0).snapshotId());
  Assert.assertEquals(1, snapshotFiles1.size());

  List<String> snapshotFiles2 = snapshotFiles(snapshots.get(1).snapshotId());
  Assert.assertEquals(1, snapshotFiles2.size());

  List<String> snapshotFiles3 = snapshotFiles(snapshots.get(2).snapshotId());
  Assert.assertEquals(2, snapshotFiles3.size());

  // write files outside of the table metadata so they become orphans
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df2.coalesce(1).write().mode("append").parquet(tableLocation + "/data/invalid/invalid");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result =
      actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
  Assert.assertEquals("Should delete 4 files", 4, Iterables.size(result.orphanFileLocations()));

  Path dataPath = new Path(tableLocation + "/data");
  FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());

  for (String fileLocation : snapshotFiles1) {
    Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
  }
  for (String fileLocation : snapshotFiles2) {
    Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
  }
  for (String fileLocation : snapshotFiles3) {
    Assert.assertTrue("All snapshot files must remain", fs.exists(new Path(fileLocation)));
  }
}
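testAllValidFilesAreKept calls a snapshotFiles(long) helper that is not reproduced on this page. A plausible sketch is shown below, assuming the helper reads the table's "files" metadata table pinned to a snapshot via the snapshot-id read option; the actual helper in the Iceberg test class may differ slightly.

// Hypothetical helper: list the data file paths reachable from a given snapshot
// by querying the table's "files" metadata table at that snapshot.
private List<String> snapshotFiles(long snapshotId) {
  return spark.read().format("iceberg")
      .option("snapshot-id", snapshotId)
      .load(tableLocation + "#files")
      .select("file_path")
      .as(Encoders.STRING())
      .collectAsList();
}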
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache: class TestRemoveOrphanFilesAction, method testRemoveOrphanFilesWithHadoopCatalog.
@Test
public void testRemoveOrphanFilesWithHadoopCatalog() throws InterruptedException {
  HadoopCatalog catalog = new HadoopCatalog(new Configuration(), tableLocation);
  String namespaceName = "testDb";
  String tableName = "testTb";

  Namespace namespace = Namespace.of(namespaceName);
  TableIdentifier tableIdentifier = TableIdentifier.of(namespace, tableName);
  Table table = catalog.createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap());

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(table.location());

  // write a file outside of the table metadata so it becomes an orphan
  df.write().mode("append").parquet(table.location() + "/data");

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  table.refresh();
  DeleteOrphanFiles.Result result =
      SparkActions.get().deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
  Assert.assertEquals("Should delete only 1 file", 1, Iterables.size(result.orphanFileLocations()));

  Dataset<Row> resultDF = spark.read().format("iceberg").load(table.location());
  List<ThreeColumnRecord> actualRecords =
      resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
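The tests pass olderThan(System.currentTimeMillis()) only because they sleep first, which makes every file old enough to be considered. Outside of tests, a retention interval is normally subtracted from the current time so that files still being written by in-flight jobs are not mistaken for orphans. A minimal sketch follows, assuming a table variable that is already loaded and the same SparkActions and DeleteOrphanFiles types used above; the three-day interval is a hypothetical choice.

import java.util.concurrent.TimeUnit;

// Only treat files older than 3 days as orphan candidates (hypothetical interval).
long olderThanTimestamp = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(3);

DeleteOrphanFiles.Result result = SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(olderThanTimestamp)
    .execute();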
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache: class TestRemoveOrphanFilesAction, method testRemoveOrphanFilesWithRelativeFilePath.
@Test
public void testRemoveOrphanFilesWithRelativeFilePath() throws IOException, InterruptedException {
  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableDir.getAbsolutePath());

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableDir.getAbsolutePath());

  List<String> validFiles = spark.read().format("iceberg")
      .load(tableLocation + "#files")
      .select("file_path")
      .as(Encoders.STRING())
      .collectAsList();
  Assert.assertEquals("Should be 1 valid file", 1, validFiles.size());
  String validFile = validFiles.get(0);

  // write a file outside of the table metadata so it becomes an orphan
  df.write().mode("append").parquet(tableLocation + "/data");

  Path dataPath = new Path(tableLocation + "/data");
  FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
  List<String> allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get()))
      .filter(FileStatus::isFile)
      .map(file -> file.getPath().toString())
      .collect(Collectors.toList());
  Assert.assertEquals("Should be 2 files", 2, allFiles.size());

  List<String> invalidFiles = Lists.newArrayList(allFiles);
  invalidFiles.removeIf(file -> file.contains(validFile));
  Assert.assertEquals("Should be 1 invalid file", 1, invalidFiles.size());

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  // use a no-op delete function so the orphan file is only reported, not removed
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .deleteWith(s -> {
      })
      .execute();
  Assert.assertEquals("Action should find 1 file", invalidFiles, result.orphanFileLocations());
  Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0))));
}
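The empty deleteWith(s -> {}) lambda above effectively turns the action into a dry run: orphan files are reported in the result but nothing is removed, which the final assertion verifies. The same hook can collect the reported locations instead of discarding them. A small sketch, assuming the same actions and table variables as the test above:

// Collect orphan file locations without deleting anything (dry run).
List<String> reported = Lists.newArrayList();
DeleteOrphanFiles.Result dryRun = actions.deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis())
    .deleteWith(reported::add)
    .execute();
// "reported" now holds the same locations as dryRun.orphanFileLocations().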
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache: class TestRemoveOrphanFilesAction, method testHiveCatalogTable.
@Test
public void testHiveCatalogTable() throws IOException {
  Table table = catalog.createTable(
      TableIdentifier.of("default", "hivetestorphan"), SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save("default.hivetestorphan");

  // create an untracked file inside the table's data directory
  String location = table.location().replaceFirst("file:", "");
  new File(location + "/data/trashfile").createNewFile();

  DeleteOrphanFiles.Result result = SparkActions.get()
      .deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis() + 1000)
      .execute();
  Assert.assertTrue("trash file should be removed",
      StreamSupport.stream(result.orphanFileLocations().spliterator(), false)
          .anyMatch(file -> file.contains("file:" + location + "data/trashfile")));
}
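All of these tests build DataFrames from ThreeColumnRecord through Encoders.bean, so the class has to be a plain Java bean with properties matching columns c1, c2, and c3. Below is a condensed, hypothetical sketch of such a bean, assuming an Integer/String/String layout; the real class in the Iceberg source also overrides equals, hashCode, and toString so that list assertions like assertEquals("Rows must match", ...) compare records by value.

// Condensed sketch of a bean compatible with Encoders.bean(ThreeColumnRecord.class).
public class ThreeColumnRecord {
  private Integer c1;
  private String c2;
  private String c3;

  public ThreeColumnRecord() {
  }

  public ThreeColumnRecord(Integer c1, String c2, String c3) {
    this.c1 = c1;
    this.c2 = c2;
    this.c3 = c3;
  }

  public Integer getC1() {
    return c1;
  }

  public void setC1(Integer c1) {
    this.c1 = c1;
  }

  public String getC2() {
    return c2;
  }

  public void setC2(String c2) {
    this.c2 = c2;
  }

  public String getC3() {
    return c3;
  }

  public void setC3(String c3) {
    this.c3 = c3;
  }

  // equals, hashCode, and toString omitted here; the real class implements them
  // so that record lists can be compared with Assert.assertEquals.
}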