
Example 1 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRewriteDataFilesAction, method testRewriteDataFilesUnpartitionedTable:

@Test
public void testRewriteDataFilesUnpartitionedTable() {
    PartitionSpec spec = PartitionSpec.unpartitioned();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 1 data files before rewrite", 1, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used: Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
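
The writeRecords helper called in these tests is not part of the snippet. A minimal sketch of what it plausibly does, assuming the records are appended through a Spark DataFrame to the same tableLocation (the actual helper in the Iceberg test class may differ):

private void writeRecords(List<ThreeColumnRecord> records) {
    // Build a DataFrame from the bean records and append it to the table location,
    // mirroring the read-back pattern used at the end of each test.
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
}

Each call produces at least one new data file, which is why two writeRecords calls yield the "4 data files before rewrite" count asserted above.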

Example 2 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRewriteDataFilesAction, method testRewriteDataFilesPartitionedTable:

@Test
public void testRewriteDataFilesPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
    Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 4 data file", 4, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used: Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
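
ThreeColumnRecord itself is only referenced in these examples. For the tests to work it has to be a JavaBean with a no-arg constructor, a (c1, c2, c3) constructor, bean accessors for Encoders.bean and spark.createDataFrame, and value-based equals/hashCode so the expected/actual list comparison passes. A rough sketch under those assumptions (field types are inferred from the calls above, not copied from the source):

public class ThreeColumnRecord {
    private Integer c1;
    private String c2;
    private String c3;

    public ThreeColumnRecord() {
    }

    public ThreeColumnRecord(Integer c1, String c2, String c3) {
        this.c1 = c1;
        this.c2 = c2;
        this.c3 = c3;
    }

    // Bean accessors required by Encoders.bean and spark.createDataFrame
    public Integer getC1() { return c1; }
    public void setC1(Integer c1) { this.c1 = c1; }
    public String getC2() { return c2; }
    public void setC2(String c2) { this.c2 = c2; }
    public String getC3() { return c3; }
    public void setC3(String c3) { this.c3 = c3; }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof ThreeColumnRecord)) return false;
        ThreeColumnRecord that = (ThreeColumnRecord) o;
        return java.util.Objects.equals(c1, that.c1)
            && java.util.Objects.equals(c2, that.c2)
            && java.util.Objects.equals(c3, that.c3);
    }

    @Override
    public int hashCode() {
        return java.util.Objects.hash(c1, c2, c3);
    }
}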

Example 3 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRewriteDataFilesAction, method testRewriteDataFilesWithFilter:

@Test
public void testRewriteDataFilesWithFilter() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"), new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"), new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"), new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"), new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH"));
    writeRecords(records4);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());
    Actions actions = Actions.forTable(table);
    RewriteDataFilesActionResult result = actions.rewriteDataFiles().filter(Expressions.equal("c1", 1)).filter(Expressions.startsWith("c2", "AA")).execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    table.refresh();
    CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
    List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 7 data files before rewrite", 7, dataFiles1.size());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used: Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
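
The two chained filter(...) calls above restrict the rewrite to the partitions where c1 = 1 and c2 starts with "AA", so only 2 of the 8 files are compacted and the other 6 are left untouched. The chained calls presumably combine with AND; a hedged equivalent that builds one combined expression up front (an assumption about the API's semantics, not something shown in the test):

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

// Combine both conditions into a single predicate and pass it as one filter.
Expression combined = Expressions.and(
    Expressions.equal("c1", 1),
    Expressions.startsWith("c2", "AA"));

RewriteDataFilesActionResult result = actions.rewriteDataFiles()
    .filter(combined)
    .execute();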

Example 4 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRequiredDistributionAndOrdering, method testRangeDistributionWithQuotedColumnNames:

@Test
public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException {
    sql("CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, `c.1`))", tableName);
    List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
    Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
    Dataset<Row> inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`");
    sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName);
    inputDF.writeTo(tableName).append();
    assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
Also used: Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)

Example 5 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in the apache/iceberg project.

From class TestRequiredDistributionAndOrdering, method testPartitionColumnsArePrependedForRangeDistribution:

@Test
public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException {
    sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
    List<ThreeColumnRecord> data = ImmutableList.of(new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"), new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"), new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"), new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
    Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
    Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
    // should automatically prepend partition columns to the ordering
    sql("ALTER TABLE %s WRITE ORDERED BY c1, c2", tableName);
    inputDF.writeTo(tableName).append();
    assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
Also used: Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
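
The sql(...), row(...), and assertEquals(...) helpers in these last two examples come from the SparkTestBase class listed under Aggregations below, not from JUnit directly. A hedged sketch of what such helpers commonly look like, assuming the usual format-then-execute pattern; the real base class may be implemented differently:

// Format the query with the table name, run it, and collect the rows as Object[] values.
protected List<Object[]> sql(String query, Object... args) {
    List<Object[]> converted = Lists.newArrayList();
    for (Row row : spark.sql(String.format(query, args)).collectAsList()) {
        Object[] values = new Object[row.size()];
        for (int i = 0; i < row.size(); i++) {
            values[i] = row.get(i);
        }
        converted.add(values);
    }
    return converted;
}

// Convenience factory for an expected row, e.g. row(7L) for the count(*) check above.
protected Object[] row(Object... values) {
    return values;
}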

Aggregations

ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord) 34
Row (org.apache.spark.sql.Row) 33
Test (org.junit.Test) 32
Table (org.apache.iceberg.Table) 26
PartitionSpec (org.apache.iceberg.PartitionSpec) 18
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles) 12
File (java.io.File) 11
Configuration (org.apache.hadoop.conf.Configuration) 11
List (java.util.List) 10
Map (java.util.Map) 10
Schema (org.apache.iceberg.Schema) 10
Snapshot (org.apache.iceberg.Snapshot) 10
TableProperties (org.apache.iceberg.TableProperties) 10
HadoopTables (org.apache.iceberg.hadoop.HadoopTables) 10
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists) 10
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps) 10
SparkTestBase (org.apache.iceberg.spark.SparkTestBase) 10
Types (org.apache.iceberg.types.Types) 10
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional) 10
Dataset (org.apache.spark.sql.Dataset) 10