Example 31 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRemoveOrphanFilesAction, method testGarbageCollectionDisabled.

@Test
public void testGarbageCollectionDisabled() {
    Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    table.updateProperties().set(TableProperties.GC_ENABLED, "false").commit();
    AssertHelpers.assertThrows("Should complain about removing orphan files",
        ValidationException.class, "Cannot remove orphan files: GC is disabled",
        () -> SparkActions.get().deleteOrphanFiles(table).execute());
}
Also used: Table(org.apache.iceberg.Table) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
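For contrast, here is a minimal sketch of the normal path, with GC left enabled (its default). The ten-minute olderThan cutoff is an illustrative value rather than anything the test above uses, and the snippet assumes imports for java.util.concurrent.TimeUnit and org.apache.iceberg.actions.DeleteOrphanFiles:

// With GC enabled, the same action executes instead of throwing.
DeleteOrphanFiles.Result result = SparkActions.get()
    .deleteOrphanFiles(table)
    // Only delete files older than the cutoff, to avoid racing in-flight writes.
    .olderThan(System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(10))
    .execute();
// Locations of the unreferenced files the action removed.
result.orphanFileLocations().forEach(System.out::println);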

Example 32 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestScanTaskSerialization, method prepareBaseCombinedScanTaskForSerDeTest.

private BaseCombinedScanTask prepareBaseCombinedScanTaskForSerDeTest() {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(
        new ThreeColumnRecord(1, null, "AAAA"),
        new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(
        new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
        new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    return new BaseCombinedScanTask(Lists.newArrayList(tasks));
}
Also used: ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord)
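The prepared task only matters once it survives a round trip, which is what the surrounding test class checks (it exercises both Java and Kryo serialization). Below is a minimal sketch of a plain Java-serialization round trip, relying on Iceberg scan tasks being Serializable; it assumes the java.io stream classes are imported, and the test name is hypothetical:

@Test
public void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException {
    BaseCombinedScanTask task = prepareBaseCombinedScanTaskForSerDeTest();
    // Write the task out through standard Java serialization...
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
        out.writeObject(task);
    }
    // ...and read it back in.
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
        BaseCombinedScanTask copy = (BaseCombinedScanTask) in.readObject();
        // The deserialized copy should carry the same number of file scan tasks.
        Assert.assertEquals(task.files().size(), copy.files().size());
    }
}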

Example 33 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRewriteManifestsAction, method testRewriteSmallManifestsNonPartitionedTable.

@Test
public void testRewriteSmallManifestsNonPartitionedTable() {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(
        new ThreeColumnRecord(1, null, "AAAA"),
        new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(
        new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
        new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());
    SparkActions actions = SparkActions.get();
    RewriteManifests.Result result = actions.rewriteManifests(table).rewriteIf(manifest -> true).execute();
    Assert.assertEquals("Action should rewrite 2 manifests", 2, Iterables.size(result.rewrittenManifests()));
    Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests()));
    table.refresh();
    List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 1 manifests after rewrite", 1, newManifests.size());
    Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
    Assert.assertFalse(newManifests.get(0).hasAddedFiles());
    Assert.assertFalse(newManifests.get(0).hasDeletedFiles());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
        .as(Encoders.bean(ThreeColumnRecord.class))
        .collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used: Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) RunWith(org.junit.runner.RunWith) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) ManifestFile(org.apache.iceberg.ManifestFile) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Parameterized(org.junit.runners.Parameterized) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) SparkTableUtil(org.apache.iceberg.spark.SparkTableUtil) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) TableIdentifier(org.apache.spark.sql.catalyst.TableIdentifier) File(java.io.File) Encoders(org.apache.spark.sql.Encoders) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot)
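The final comparison works because Encoders.bean maps rows onto a plain Java bean whose property names match the selected columns c1, c2, c3. A sketch of the shape such a bean needs follows: a no-arg constructor plus getters and setters. The real helper in org.apache.iceberg.spark.source also overrides equals, hashCode, and toString so that list assertions like the one above compare by value:

public class ThreeColumnRecord {
    private Integer c1;
    private String c2;
    private String c3;

    // Spark's bean encoder requires a public no-arg constructor.
    public ThreeColumnRecord() {
    }

    public ThreeColumnRecord(Integer c1, String c2, String c3) {
        this.c1 = c1;
        this.c2 = c2;
        this.c3 = c3;
    }

    public Integer getC1() { return c1; }
    public void setC1(Integer c1) { this.c1 = c1; }
    public String getC2() { return c2; }
    public void setC2(String c2) { this.c2 = c2; }
    public String getC3() { return c3; }
    public void setC3(String c3) { this.c3 = c3; }
}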

Example 34 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRewriteManifestsAction, method testRewriteImportedManifests.

@Test
public void testRewriteImportedManifests() throws IOException {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(
        new ThreeColumnRecord(1, null, "AAAA"),
        new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> inputDF = spark.createDataFrame(records, ThreeColumnRecord.class);
        inputDF.select("c1", "c2", "c3").write()
            .format("parquet")
            .mode("overwrite")
            .option("path", parquetTableLocation)
            .partitionBy("c3")
            .saveAsTable("parquet_table");
        File stagingDir = temp.newFolder("staging-dir");
        SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
        Snapshot snapshot = table.currentSnapshot();
        SparkActions actions = SparkActions.get();
        RewriteManifests.Result result = actions.rewriteManifests(table)
            .rewriteIf(manifest -> true)
            .stagingLocation(temp.newFolder().toString())
            .execute();
        Assert.assertEquals("Action should rewrite all manifests", snapshot.allManifests(), result.rewrittenManifests());
        Assert.assertEquals("Action should add 1 manifest", 1, Iterables.size(result.addedManifests()));
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used: TableIdentifier(org.apache.spark.sql.catalyst.TableIdentifier) Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) Snapshot(org.apache.iceberg.Snapshot) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Row(org.apache.spark.sql.Row) ManifestFile(org.apache.iceberg.ManifestFile) File(java.io.File) Test(org.junit.Test)
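Both examples pass rewriteIf(manifest -> true), so every manifest is rewritten; the predicate is what makes the action selective. Here is a sketch of a more typical use that compacts only small manifests, where the 8 MB threshold is an illustrative value rather than anything the tests above use:

// Rewrite only manifests smaller than 8 MB; larger ones are left untouched.
long smallSizeThreshold = 8L * 1024 * 1024;
RewriteManifests.Result result = SparkActions.get()
    .rewriteManifests(table)
    .rewriteIf(manifest -> manifest.length() < smallSizeThreshold)
    .execute();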

Aggregations

Classes used together with ThreeColumnRecord across the 34 examples, with the number of examples each appears in:

ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord): 34
Row (org.apache.spark.sql.Row): 33
Test (org.junit.Test): 32
Table (org.apache.iceberg.Table): 26
PartitionSpec (org.apache.iceberg.PartitionSpec): 18
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 12
File (java.io.File): 11
Configuration (org.apache.hadoop.conf.Configuration): 11
List (java.util.List): 10
Map (java.util.Map): 10
Schema (org.apache.iceberg.Schema): 10
Snapshot (org.apache.iceberg.Snapshot): 10
TableProperties (org.apache.iceberg.TableProperties): 10
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 10
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 10
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 10
SparkTestBase (org.apache.iceberg.spark.SparkTestBase): 10
Types (org.apache.iceberg.types.Types): 10
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 10
Dataset (org.apache.spark.sql.Dataset): 10