Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
In class TestRemoveOrphanFilesAction, method testDryRun.
@Test
public void testDryRun() throws IOException, InterruptedException {
  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  List<String> validFiles = spark.read().format("iceberg")
      .load(tableLocation + "#files")
      .select("file_path")
      .as(Encoders.STRING())
      .collectAsList();
  Assert.assertEquals("Should be 2 valid files", 2, validFiles.size());

  // write a plain Parquet file into the data directory so it is not tracked by the table
  df.write().mode("append").parquet(tableLocation + "/data");

  Path dataPath = new Path(tableLocation + "/data");
  FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
  List<String> allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get()))
      .filter(FileStatus::isFile)
      .map(file -> file.getPath().toString())
      .collect(Collectors.toList());
  Assert.assertEquals("Should be 3 files", 3, allFiles.size());

  List<String> invalidFiles = Lists.newArrayList(allFiles);
  invalidFiles.removeAll(validFiles);
  Assert.assertEquals("Should be 1 invalid file", 1, invalidFiles.size());

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();

  // dry run with the default olderThan interval: the orphan file is too recent to be reported
  DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table)
      .deleteWith(s -> { })
      .execute();
  Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations()));

  // dry run with olderThan set to now: the orphan file is reported but not deleted
  DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .deleteWith(s -> { })
      .execute();
  Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations());
  Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0))));

  // real run: the orphan file is deleted
  DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .execute();
  Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations());
  Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0))));

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records);
  expectedRecords.addAll(records);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
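ThreeColumnRecord is consumed through spark.createDataFrame(records, ThreeColumnRecord.class) and Encoders.bean(ThreeColumnRecord.class), so it must be a plain Java bean with a no-arg constructor, getters and setters, and value-based equals/hashCode for the assertions above to compare records. The sketch below is an assumption of what such a bean looks like, with field names c1, c2, c3 matching the columns selected in the tests; the class shipped with Iceberg may differ in detail.

import java.util.Objects;

// hypothetical sketch of the bean used by these tests
public class ThreeColumnRecord {
  private Integer c1;
  private String c2;
  private String c3;

  public ThreeColumnRecord() {
  }

  public ThreeColumnRecord(Integer c1, String c2, String c3) {
    this.c1 = c1;
    this.c2 = c2;
    this.c3 = c3;
  }

  public Integer getC1() { return c1; }
  public void setC1(Integer c1) { this.c1 = c1; }
  public String getC2() { return c2; }
  public void setC2(String c2) { this.c2 = c2; }
  public String getC3() { return c3; }
  public void setC3(String c3) { this.c3 = c3; }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    ThreeColumnRecord that = (ThreeColumnRecord) o;
    return Objects.equals(c1, that.c1) && Objects.equals(c2, that.c2) && Objects.equals(c3, that.c3);
  }

  @Override
  public int hashCode() {
    return Objects.hash(c1, c2, c3);
  }
}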
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
In class TestRemoveOrphanFilesAction, method testManyTopLevelPartitions.
@Test
public void testManyTopLevelPartitions() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 100; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i)));
  }

  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  // sleep for 1 second to ensure files will be old enough
  Thread.sleep(1000);

  SparkActions actions = SparkActions.get();
  DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table)
      .olderThan(System.currentTimeMillis())
      .execute();
  Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations()));

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
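SCHEMA, SPEC, TABLES, and spark are members of the enclosing test class and are not shown on this page. Below is a minimal sketch of plausible definitions, assuming a three-column schema that mirrors ThreeColumnRecord, a HadoopTables catalog, and an identity partition on c1; the actual constants in TestRemoveOrphanFilesAction may differ.

import static org.apache.iceberg.types.Types.NestedField.optional;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

// hypothetical definitions for the fields used above
private static final HadoopTables TABLES = new HadoopTables(new Configuration());
private static final Schema SCHEMA = new Schema(
    optional(1, "c1", Types.IntegerType.get()),
    optional(2, "c2", Types.StringType.get()),
    optional(3, "c3", Types.StringType.get()));
private static final PartitionSpec SPEC = PartitionSpec.builderFor(SCHEMA)
    .identity("c1")  // assumed partitioning; the real spec may use different fields
    .build();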
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
In class TestRewriteDataFilesAction, method writeRecords.
private void writeRecords(int files, int numRecords, int partitions) {
  List<ThreeColumnRecord> records = Lists.newArrayList();
  int rowDimension = (int) Math.ceil(Math.sqrt(numRecords));
  List<Pair<Integer, Integer>> data = IntStream.range(0, rowDimension).boxed()
      .flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y)))
      .collect(Collectors.toList());
  Collections.shuffle(data, new Random(42));
  if (partitions > 0) {
    data.forEach(i ->
        records.add(new ThreeColumnRecord(i.first() % partitions, "foo" + i.first(), "bar" + i.second())));
  } else {
    data.forEach(i ->
        records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second())));
  }
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files);
  writeDF(df);
}
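writeDF is a private helper of the same test class and is not reproduced on this page. A minimal sketch, assuming it simply appends the three columns to the Iceberg table at tableLocation; the real helper may set additional write options.

private void writeDF(Dataset<Row> df) {
  // append the generated records to the Iceberg table under test
  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);
}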
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
In class TestRewriteManifestsAction, method testRewriteSmallManifestsPartitionedTable.
@Test
public void testRewriteSmallManifestsPartitionedTable() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"),
      new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF"));
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"),
      new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH"));
  writeRecords(records4);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size());

  SparkActions actions = SparkActions.get();

  // expect 2 manifests with 4 entries each after the rewrite
  long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
  long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);
  table.updateProperties()
      .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes))
      .commit();

  RewriteManifests.Result result = actions.rewriteManifests(table)
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests()));
  Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests()));

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount());
  Assert.assertFalse(newManifests.get(1).hasAddedFiles());
  Assert.assertFalse(newManifests.get(1).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
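The target manifest size above is derived from computeManifestEntrySizeBytes, another private helper that this page does not show. A sketch of one reasonable implementation, assuming it averages the on-disk manifest length over the total number of entries (added, existing, and deleted) across all manifests; the actual helper may differ.

private long computeManifestEntrySizeBytes(List<ManifestFile> manifests) {
  long totalSize = 0L;
  int numEntries = 0;
  for (ManifestFile manifest : manifests) {
    totalSize += manifest.length();
    numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount();
  }
  // average size of a single manifest entry, used to size the rewrite target
  return totalSize / numEntries;
}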
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
In class TestRewriteManifestsAction, method testRewriteLargeManifestsPartitionedTable.
@Test
public void testRewriteLargeManifestsPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  // all records belong to the same partition
  List<ThreeColumnRecord> records = Lists.newArrayList();
  for (int i = 0; i < 50; i++) {
    records.add(new ThreeColumnRecord(i, String.valueOf(i), "0"));
  }
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  // repartition to create separate files
  writeDF(df.repartition(50, df.col("c1")));

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest before rewrite", 1, manifests.size());

  // set the target manifest size to a small value to force splitting records into multiple files
  table.updateProperties()
      .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2))
      .commit();

  SparkActions actions = SparkActions.get();
  RewriteManifests.Result result = actions.rewriteManifests(table)
      .rewriteIf(manifest -> true)
      .stagingLocation(temp.newFolder().toString())
      .execute();

  Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests()));
  Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests()));

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF
      .sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
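The two rewrite-manifests tests also rely on writeRecords(List<ThreeColumnRecord>) and on the temp folder used for the staging location, neither of which appears on this page. A plausible sketch, assuming writeRecords delegates to writeDF and temp is a JUnit TemporaryFolder rule; the real members may differ.

@Rule
public TemporaryFolder temp = new TemporaryFolder();  // org.junit.Rule, org.junit.rules.TemporaryFolder

private void writeRecords(List<ThreeColumnRecord> records) {
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  writeDF(df);
}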