Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
Class TestRemoveOrphanFilesAction, method testGarbageCollectionDisabled.
@Test
public void testGarbageCollectionDisabled() {
  Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);

  table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();

  AssertHelpers.assertThrows("Should complain about removing orphan files",
      ValidationException.class, "Cannot remove orphan files: GC is disabled",
      () -> SparkActions.get().deleteOrphanFiles(table).execute());
}
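The ThreeColumnRecord class itself never appears on this page. A minimal sketch of what the test bean presumably looks like, assuming a standard Java bean with an Integer c1 and String c2/c3 fields, which is what spark.createDataFrame(records, ThreeColumnRecord.class) above and Encoders.bean below require; the real class in org.apache.iceberg.spark.source may differ in detail:

// Hedged reconstruction of the test bean, not the actual Iceberg source.
public class ThreeColumnRecord {
  private Integer c1;
  private String c2;
  private String c3;

  // Encoders.bean and createDataFrame need a public no-arg constructor.
  public ThreeColumnRecord() {
  }

  public ThreeColumnRecord(Integer c1, String c2, String c3) {
    this.c1 = c1;
    this.c2 = c2;
    this.c3 = c3;
  }

  public Integer getC1() { return c1; }
  public void setC1(Integer c1) { this.c1 = c1; }
  public String getC2() { return c2; }
  public void setC2(String c2) { this.c2 = c2; }
  public String getC3() { return c3; }
  public void setC3(String c3) { this.c3 = c3; }

  // equals/hashCode based on all three fields, so that the list
  // comparisons in the assertions below can pass.
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    ThreeColumnRecord that = (ThreeColumnRecord) o;
    return java.util.Objects.equals(c1, that.c1)
        && java.util.Objects.equals(c2, that.c2)
        && java.util.Objects.equals(c3, that.c3);
  }

  @Override
  public int hashCode() {
    return java.util.Objects.hash(c1, c2, c3);
  }
}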
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
Class TestScanTaskSerialization, method prepareBaseCombinedScanTaskForSerDeTest.
private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
  writeRecords(records2);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  return new BaseCombinedScanTask(Lists.newArrayList(tasks));
}
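This method and the rewrite-manifests tests below call a writeRecords helper that isn't reproduced on this page. A plausible sketch, assuming it mirrors the append-mode DataFrame write from the first example; the actual helper in the Iceberg test suite may differ:

// Hypothetical helper: append a batch of ThreeColumnRecord rows to the
// Iceberg table at tableLocation.
private void writeRecords(List<ThreeColumnRecord> records) {
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
  df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
}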
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
Class TestRewriteManifestsAction, method testRewriteSmallManifestsNonPartitionedTable.
@Test
public void testRewriteSmallManifestsNonPartitionedTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
  writeRecords(records2);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());

  SparkActions actions = SparkActions.get();
  RewriteManifests.Result result = actions.rewriteManifests(table)
      .rewriteIf(manifest -> true)
      .execute();
  Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests()));
  Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests()));

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest after rewrite", 1, newManifests.size());
  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();
  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
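The rewriteIf(manifest -> true) predicate above forces every manifest through the rewrite; in practice the predicate is used to limit the action's scope. A small sketch, assuming you only want to compact manifests below some size threshold; the 10 MB figure is illustrative, not from the test:

// Only rewrite manifests smaller than ~10 MB; larger ones are left as-is.
RewriteManifests.Result result = actions.rewriteManifests(table)
    .rewriteIf(manifest -> manifest.length() < 10 * 1024 * 1024)
    .execute();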
Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.
Class TestRewriteManifestsAction, method testRewriteImportedManifests.
@Test
public void testRewriteImportedManifests() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, ThreeColumnRecord.class);
    inputDF.select("c1", "c2", "c3").write()
        .format("parquet")
        .mode("overwrite")
        .option("path", parquetTableLocation)
        .partitionBy("c3")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());

    Snapshot snapshot = table.currentSnapshot();

    SparkActions actions = SparkActions.get();
    RewriteManifests.Result result = actions.rewriteManifests(table)
        .rewriteIf(manifest -> true)
        .stagingLocation(temp.newFolder().toString())
        .execute();

    Assert.assertEquals("Action should rewrite all manifests", snapshot.allManifests(), result.rewrittenManifests());
    Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests()));
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
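Both TestRewriteManifestsAction methods read a snapshotIdInheritanceEnabled field that is set outside the snippets shown here. A plausible sketch of the JUnit 4 parameterization behind it; this is hypothetical, since the actual runner setup isn't part of this page:

// Hypothetical parameterization: each test runs once with snapshot ID
// inheritance enabled and once with it disabled.
@RunWith(Parameterized.class)
public class TestRewriteManifestsAction {
  @Parameterized.Parameters(name = "snapshotIdInheritanceEnabled = {0}")
  public static Object[] parameters() {
    return new Object[] { "true", "false" };
  }

  private final String snapshotIdInheritanceEnabled;

  public TestRewriteManifestsAction(String snapshotIdInheritanceEnabled) {
    this.snapshotIdInheritanceEnabled = snapshotIdInheritanceEnabled;
  }

  // ... test methods shown above ...
}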