
Example 16 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteManifestsAction, method testRewriteManifestsWithPredicate.

@Test
public void testRewriteManifestsWithPredicate() throws IOException {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
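    // snapshotIdInheritanceEnabled is a test parameter supplied by the @Parameterized runner (see the imports listed below)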
    options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());
    SparkActions actions = SparkActions.get();
    // rewrite only the first manifest without caching
    RewriteManifests.Result result = actions.rewriteManifests(table)
            .rewriteIf(manifest -> manifest.path().equals(manifests.get(0).path()))
            .stagingLocation(temp.newFolder().toString())
            .option("use-caching", "false")
            .execute();
    Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests()));
    Assert.assertEquals("Action should add 1 manifests", 1, Iterables.size(result.addedManifests()));
    table.refresh();
    List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
    Assert.assertFalse("First manifest must be rewritten", newManifests.contains(manifests.get(0)));
    Assert.assertTrue("Second manifest must not be rewritten", newManifests.contains(manifests.get(1)));
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) RunWith(org.junit.runner.RunWith) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) ManifestFile(org.apache.iceberg.ManifestFile) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Parameterized(org.junit.runners.Parameterized) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) SparkTableUtil(org.apache.iceberg.spark.SparkTableUtil) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) TableIdentifier(org.apache.spark.sql.catalyst.TableIdentifier) File(java.io.File) Encoders(org.apache.spark.sql.Encoders) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot)
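The writeRecords calls above rely on a helper that is not shown on this page. A minimal sketch of what such helpers likely look like, assuming the standard Spark append path onto the Hadoop table at tableLocation (the helper names and the column projection are assumptions, not verbatim project source):

private void writeRecords(List<ThreeColumnRecord> records) {
    // hypothetical helper: turn the bean list into a DataFrame and append it
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    writeDF(df);
}

private void writeDF(Dataset<Row> df) {
    // append to the Iceberg table created at tableLocation in the test
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
}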

Example 17 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteDataFilesForLargeFile.

@Test
public void testRewriteDataFilesForLargeFile() throws AnalysisException {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    Assert.assertNull("Table must be empty", table.currentSnapshot());
    List<ThreeColumnRecord> records1 = Lists.newArrayList();
    IntStream.range(0, 2000).forEach(i -> records1.add(new ThreeColumnRecord(i, "foo" + i, "bar" + i)));
    Dataset<Row> df = spark.createDataFrame(records1, ThreeColumnRecord.class).repartition(1);
    writeDF(df);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    DataFile maxSizeFile = Collections.max(dataFiles, Comparator.comparingLong(DataFile::fileSizeInBytes));
    Assert.assertEquals("Should have 3 files before rewrite", 3, dataFiles.size());
    spark.read().format("iceberg").load(tableLocation).createTempView("origin");
    long originalNumRecords = spark.read().format("iceberg").load(tableLocation).count();
    List<Object[]> originalRecords = sql("SELECT * from origin sort by c2");
    Actions actions = Actions.forTable(table);
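    // choose a target size just below the largest existing file so the rewrite has to split it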
    long targetSizeInBytes = maxSizeFile.fileSizeInBytes() - 10;
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().targetSizeInBytes(targetSizeInBytes).splitOpenFileCost(1).execute();
    Assert.assertEquals("Action should delete 3 data files", 3, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 2 data files", 2, result.addedDataFiles().size());
    spark.read().format("iceberg").load(tableLocation).createTempView("postRewrite");
    long postRewriteNumRecords = spark.read().format("iceberg").load(tableLocation).count();
    List<Object[]> rewrittenRecords = sql("SELECT * from postRewrite sort by c2");
    Assert.assertEquals(originalNumRecords, postRewriteNumRecords);
    assertEquals("Rows should be unchanged", originalRecords, rewrittenRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
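The before/after comparison uses a sql(...) helper inherited from SparkTestBase, which is not shown here. A rough sketch of such a helper, assuming it formats the query, runs it, and flattens each Row into an Object[] (the signature and conversion details are assumptions):

protected List<Object[]> sql(String query, Object... args) {
    // hypothetical helper: run the formatted query and convert Rows to plain arrays
    List<Object[]> converted = Lists.newArrayList();
    for (Row row : spark.sql(String.format(query, args)).collectAsList()) {
        Object[] values = new Object[row.size()];
        for (int i = 0; i < row.size(); i++) {
            values[i] = row.get(i);
        }
        converted.add(values);
    }
    return converted;
}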

Example 18 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteLargeTableHasResiduals.

@Test
public void testRewriteLargeTableHasResiduals() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).build();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100");
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    // all records belong to the same partition
    List<ThreeColumnRecord> records = Lists.newArrayList();
    for (int i = 0; i < 100; i++) {
        records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i % 4)));
    }
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    writeDF(df);
    table.refresh();
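    // ignoreResiduals() drops per-task residual filters, so every planned task reports alwaysTrue()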
    CloseableIterable<FileScanTask> tasks = table.newScan().ignoreResiduals().filter(Expressions.equal("c3", "0")).planFiles();
    for (FileScanTask task : tasks) {
        Assert.assertEquals("Residuals must be ignored", Expressions.alwaysTrue(), task.residual());
    }
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 2 data files before rewrite", 2, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c3", "0")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
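For contrast with the ignoreResiduals() scan in the test, a scan planned with the same filter but without ignoreResiduals() keeps the unevaluated predicate on each task. A small illustrative check, not part of the original test:

// illustrative only: plan the same filter without ignoreResiduals()
CloseableIterable<FileScanTask> tasksWithResiduals =
        table.newScan().filter(Expressions.equal("c3", "0")).planFiles();
for (FileScanTask task : tasksWithResiduals) {
    // residual() is the part of the filter that must still be applied per row,
    // so here it stays the c3 = "0" predicate rather than alwaysTrue()
    Assert.assertNotEquals(Expressions.alwaysTrue(), task.residual());
}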

Example 19 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteToOutputPartitionSpec.

@Test
public void testRewriteToOutputPartitionSpec() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
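    // evolve the partition spec: spec 0 is identity(c1), spec 1 adds truncate(c2, 2)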
    table.updateSpec().addField(Expressions.truncate("c2", 2)).commit();
    Assert.assertEquals("Should have 2 partitions specs", 2, table.specs().size());
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Dataset<Row> beforeResultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> beforeActualFilteredRecords = beforeResultDF.sort("c1", "c2", "c3")
            .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'")
            .as(Encoders.bean(ThreeColumnRecord.class))
            .collectAsList();
    Assert.assertEquals("Rows must match", records2, beforeActualFilteredRecords);
    Actions actions = Actions.forTable(table);
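    // outputSpecId(0) writes the rewritten files back under the original identity(c1) spec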
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().outputSpecId(0).execute();
    Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 2 data file", 2, result.addedDataFiles().size());
    Assert.assertTrue(result.deletedDataFiles().stream().allMatch(df -> df.specId() == 1));
    Assert.assertTrue(result.addedDataFiles().stream().allMatch(df -> df.specId() == 0));
    table.refresh();
    CloseableIterable<FileScanTask> tasks2 = table.newScan().planFiles();
    List<DataFile> dataFiles2 = Lists.newArrayList(CloseableIterable.transform(tasks2, FileScanTask::file));
    Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFiles2.size());
    // Should still have all the same data
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
    List<ThreeColumnRecord> actualFilteredRecords = resultDF.sort("c1", "c2", "c3")
            .filter("c1 = 1 AND c2 = 'BBBBBBBBBB'")
            .as(Encoders.bean(ThreeColumnRecord.class))
            .collectAsList();
    Assert.assertEquals("Rows must match", records2, actualFilteredRecords);
    List<ThreeColumnRecord> records5 = Lists.newArrayList(new ThreeColumnRecord(3, "CCCCCCCCCC", "FFFF"), new ThreeColumnRecord(3, "CCCCCCCCCC", "HHHH"));
    writeRecords(records5);
    expectedRecords.addAll(records5);
    actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : IntStream(java.util.stream.IntStream) AnalysisException(org.apache.spark.sql.AnalysisException) Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) FileScanTask(org.apache.iceberg.FileScanTask) DataFile(org.apache.iceberg.DataFile) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Test(org.junit.Test) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) File(java.io.File) Encoders(org.apache.spark.sql.Encoders) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Comparator(java.util.Comparator) Expressions(org.apache.iceberg.expressions.Expressions) Assert(org.junit.Assert) Collections(java.util.Collections) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder)
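Because this test exercises partition spec evolution, it can help to look at the two specs explicitly. A small illustrative snippet, not part of the original test:

// illustrative only: spec 0 is the original identity(c1) layout,
// spec 1 is the evolved layout that adds truncate(c2, 2)
table.specs().forEach((specId, partitionSpec) ->
        System.out.println(specId + " -> " + partitionSpec));

Since outputSpecId(0) targets the identity(c1) layout, the compacted files group by c1 alone, which is why only two data files (one per c1 value) remain after the rewrite.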

Example 20 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteDataFilesProcedure, method insertData.

private void insertData(int filesCount) {
    ThreeColumnRecord record1 = new ThreeColumnRecord(1, "foo", null);
    ThreeColumnRecord record2 = new ThreeColumnRecord(2, "bar", null);
    List<ThreeColumnRecord> records = Lists.newArrayList();
    IntStream.range(0, filesCount / 2).forEach(i -> {
        records.add(record1);
        records.add(record2);
    });
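    // repartition so the write produces roughly filesCount small data files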
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(filesCount);
    try {
        df.writeTo(tableName).append();
    } catch (org.apache.spark.sql.catalyst.analysis.NoSuchTableException e) {
        throw new RuntimeException(e);
    }
}
Also used : Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord)
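insertData is a setup helper; in the surrounding procedure test it would typically be followed by a call to the rewrite_data_files stored procedure. A hedged usage sketch, where catalogName and tableIdent are assumed fields of the test base and are not shown on this page:

// hypothetical usage: create 10 small files, then compact them via the procedure
insertData(10);
List<Object[]> output = sql(
        "CALL %s.system.rewrite_data_files(table => '%s')", catalogName, tableIdent);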

Aggregations

ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord): 34
Row (org.apache.spark.sql.Row): 33
Test (org.junit.Test): 32
Table (org.apache.iceberg.Table): 26
PartitionSpec (org.apache.iceberg.PartitionSpec): 18
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 12
File (java.io.File): 11
Configuration (org.apache.hadoop.conf.Configuration): 11
List (java.util.List): 10
Map (java.util.Map): 10
Schema (org.apache.iceberg.Schema): 10
Snapshot (org.apache.iceberg.Snapshot): 10
TableProperties (org.apache.iceberg.TableProperties): 10
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 10
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 10
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 10
SparkTestBase (org.apache.iceberg.spark.SparkTestBase): 10
Types (org.apache.iceberg.types.Types): 10
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 10
Dataset (org.apache.spark.sql.Dataset): 10