
Example 16 with RowDelta

Use of org.apache.iceberg.RowDelta in project iceberg by apache.

From the class TestPositionDeltaWriters, method testPositionDeltaDeleteOnly.

@Test
public void testPositionDeltaDeleteOnly() throws IOException {
    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
    // add an unpartitioned data file
    ImmutableList<T> rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa"));
    DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
    table.newFastAppend().appendFile(dataFile1).commit();
    // partition by data
    table.updateSpec().addField(Expressions.ref("data")).commit();
    // add a data file partitioned by data
    ImmutableList<T> rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"));
    DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
    table.newFastAppend().appendFile(dataFile2).commit();
    PartitionSpec unpartitionedSpec = table.specs().get(0);
    PartitionSpec partitionedSpec = table.specs().get(1);
    // writers for inserts, updates, and position deletes, all clustered by partition
    ClusteredDataWriter<T> insertWriter = new ClusteredDataWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    ClusteredDataWriter<T> updateWriter = new ClusteredDataWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    ClusteredPositionDeleteWriter<T> deleteWriter = new ClusteredPositionDeleteWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    PositionDeltaWriter<T> deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter);
    // position 2 of the unpartitioned file is the row (11, "aaa")
    deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null);
    // position 1 of the "bbb" partition file is the row (4, "bbb")
    deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb"));
    deltaWriter.close();
    WriteResult result = deltaWriter.result();
    DataFile[] dataFiles = result.dataFiles();
    DeleteFile[] deleteFiles = result.deleteFiles();
    CharSequence[] referencedDataFiles = result.referencedDataFiles();
    Assert.assertEquals("Must be 0 data files", 0, dataFiles.length);
    Assert.assertEquals("Must be 2 delete files", 2, deleteFiles.length);
    Assert.assertEquals("Must reference 2 data files", 2, referencedDataFiles.length);
    RowDelta rowDelta = table.newRowDelta();
    for (DeleteFile deleteFile : deleteFiles) {
        rowDelta.addDeletes(deleteFile);
    }
    rowDelta.commit();
    List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"));
    Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
Also used: RowDelta (org.apache.iceberg.RowDelta), PartitionSpec (org.apache.iceberg.PartitionSpec), DataFile (org.apache.iceberg.DataFile), DeleteFile (org.apache.iceberg.DeleteFile), Test (org.junit.Test)
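Stripped of the assertions, the delete-only path above reduces to a small commit routine. The sketch below is not code from the project: the helper name is hypothetical, and it assumes table is an already-loaded org.apache.iceberg.Table and result is the WriteResult of a closed PositionDeltaWriter.

// Hypothetical helper: commit the position delete files from a WriteResult
// as a single row-level delta (one new snapshot).
private void commitDeletes(Table table, WriteResult result) {
    RowDelta rowDelta = table.newRowDelta();
    for (DeleteFile deleteFile : result.deleteFiles()) {
        rowDelta.addDeletes(deleteFile);
    }
    rowDelta.commit();
}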

Example 17 with RowDelta

Use of org.apache.iceberg.RowDelta in project iceberg by apache.

From the class TestPositionDeltaWriters, method testPositionDeltaMultipleSpecs.

@Test
public void testPositionDeltaMultipleSpecs() throws IOException {
    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
    // add an unpartitioned data file
    ImmutableList<T> rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa"));
    DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
    table.newFastAppend().appendFile(dataFile1).commit();
    // partition by data
    table.updateSpec().addField(Expressions.ref("data")).commit();
    // add a data file partitioned by data
    ImmutableList<T> rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"));
    DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
    table.newFastAppend().appendFile(dataFile2).commit();
    PartitionSpec unpartitionedSpec = table.specs().get(0);
    PartitionSpec partitionedSpec = table.specs().get(1);
    // writers for inserts, updates, and position deletes, all clustered by partition
    ClusteredDataWriter<T> insertWriter = new ClusteredDataWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    ClusteredDataWriter<T> updateWriter = new ClusteredDataWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    ClusteredPositionDeleteWriter<T> deleteWriter = new ClusteredPositionDeleteWriter<>(
        writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    PositionDeltaWriter<T> deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter);
    // position 2 of the unpartitioned file is the row (11, "aaa")
    deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null);
    // position 1 of the "bbb" partition file is the row (4, "bbb")
    deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb"));
    // insert a new row into a new "ccc" partition of the current spec
    deltaWriter.insert(toRow(10, "ccc"), partitionedSpec, partitionKey(partitionedSpec, "ccc"));
    deltaWriter.close();
    WriteResult result = deltaWriter.result();
    DataFile[] dataFiles = result.dataFiles();
    DeleteFile[] deleteFiles = result.deleteFiles();
    CharSequence[] referencedDataFiles = result.referencedDataFiles();
    Assert.assertEquals("Must be 1 data files", 1, dataFiles.length);
    Assert.assertEquals("Must be 2 delete files", 2, deleteFiles.length);
    Assert.assertEquals("Must reference 2 data files", 2, referencedDataFiles.length);
    RowDelta rowDelta = table.newRowDelta();
    for (DataFile dataFile : dataFiles) {
        rowDelta.addRows(dataFile);
    }
    for (DeleteFile deleteFile : deleteFiles) {
        rowDelta.addDeletes(deleteFile);
    }
    rowDelta.commit();
    List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(10, "ccc"));
    Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
Also used: RowDelta (org.apache.iceberg.RowDelta), PartitionSpec (org.apache.iceberg.PartitionSpec), DataFile (org.apache.iceberg.DataFile), DeleteFile (org.apache.iceberg.DeleteFile), Test (org.junit.Test)
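Because this test spans two partition specs, the files in the WriteResult can belong to either spec. Each content file records the spec it was written with, so a caller can check that if needed; the loop below is a sketch added for illustration (specId() comes from the ContentFile interface) and is not part of the original test.

// Sketch: verify each delete file was written against one of the table's specs.
for (DeleteFile deleteFile : deleteFiles) {
    boolean fromUnpartitioned = deleteFile.specId() == unpartitionedSpec.specId();
    boolean fromPartitioned = deleteFile.specId() == partitionedSpec.specId();
    Assert.assertTrue("Delete file must use a known spec", fromUnpartitioned || fromPartitioned);
}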

Example 18 with RowDelta

Use of org.apache.iceberg.RowDelta in project iceberg by apache.

From the class TestTaskEqualityDeltaWriter, method commitTransaction.

private void commitTransaction(WriteResult result) {
    RowDelta rowDelta = table.newRowDelta();
    Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
    Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
    rowDelta.validateDeletedFiles()
        .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles()))
        .commit();
}
Also used: RowDelta (org.apache.iceberg.RowDelta)
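The two validations guard against concurrent changes to the table: roughly, they make the commit fail if a data file referenced by the position deletes has been rewritten or removed since the writer started, instead of silently committing deletes that point at files no longer in the table. A typical call site looks like the sketch below; it assumes deltaWriter is a closed delta writer whose result() returns a WriteResult, as in the earlier examples (the exact writer type used by this test class may differ).

// Sketch of a call site: close the writer, then commit its result with validation.
deltaWriter.close();
WriteResult result = deltaWriter.result();
commitTransaction(result);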

Example 19 with RowDelta

Use of org.apache.iceberg.RowDelta in project iceberg by apache.

From the class IcebergSourceDeleteBenchmark, method writeEqDeletes.

private void writeEqDeletes(List<InternalRow> rows) throws IOException {
    int equalityFieldId = table().schema().findField("longCol").fieldId();
    OutputFileFactory fileFactory = newFileFactory();
    // equality deletes keyed on longCol; delete rows carry the full table schema
    SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
        .dataFileFormat(fileFormat())
        .equalityDeleteRowSchema(table().schema())
        .equalityFieldIds(new int[] { equalityFieldId })
        .build();
    ClusteredEqualityDeleteWriter<InternalRow> writer = new ClusteredEqualityDeleteWriter<>(
        writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);
    PartitionSpec unpartitionedSpec = table().specs().get(0);
    try (ClusteredEqualityDeleteWriter<InternalRow> closeableWriter = writer) {
        for (InternalRow row : rows) {
            closeableWriter.write(row, unpartitionedSpec, null);
        }
    }
    RowDelta rowDelta = table().newRowDelta();
    LOG.info("Number of delete files: {}", writer.result().deleteFiles().size());
    writer.result().deleteFiles().forEach(rowDelta::addDeletes);
    rowDelta.validateDeletedFiles().commit();
}
Also used: OutputFileFactory (org.apache.iceberg.io.OutputFileFactory), ClusteredEqualityDeleteWriter (org.apache.iceberg.io.ClusteredEqualityDeleteWriter), RowDelta (org.apache.iceberg.RowDelta), PartitionSpec (org.apache.iceberg.PartitionSpec), InternalRow (org.apache.spark.sql.catalyst.InternalRow), GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow)
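Unlike a position delete, an equality delete removes any row whose equality columns match a row in the delete file; here the only equality field is longCol, but the delete rows still carry the full table schema because of equalityDeleteRowSchema(table().schema()). Narrowing the delete row schema is also possible; the variation below is an assumption added for illustration rather than something the benchmark does, and the rows passed to write() would then have to match the narrowed single-column schema.

// Sketch: narrow the equality delete row schema to just the equality column.
// Schema here is org.apache.iceberg.Schema.
Schema deleteRowSchema = table().schema().select("longCol");
SparkFileWriterFactory narrowFactory = SparkFileWriterFactory.builderFor(table())
    .dataFileFormat(fileFormat())
    .equalityDeleteRowSchema(deleteRowSchema)
    .equalityFieldIds(new int[] { equalityFieldId })
    .build();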

Example 20 with RowDelta

Use of org.apache.iceberg.RowDelta in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testBinPackWithDeleteAllData.

@Test
public void testBinPackWithDeleteAllData() {
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.FORMAT_VERSION, "2");
    Table table = createTablePartitioned(1, 1, 1, options);
    shouldHaveFiles(table, 1);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();
    RowDelta rowDelta = table.newRowDelta();
    // remove all data
    writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes);
    rowDelta.commit();
    table.refresh();
    List<Object[]> expectedRecords = currentData();
    Result result = actions().rewriteDataFiles(table)
        .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1")
        .execute();
    Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount());
    List<Object[]> actualRecords = currentData();
    assertEquals("Rows must match", expectedRecords, actualRecords);
    Assert.assertEquals("Data manifest should not have existing data file", 0, (long) table.currentSnapshot().dataManifests().get(0).existingFilesCount());
    Assert.assertEquals("Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests().get(0).deletedFilesCount());
    Assert.assertEquals("Delete manifest added row count should equal total count", total, (long) table.currentSnapshot().deleteManifests().get(0).addedRowsCount());
}
Also used: Table (org.apache.iceberg.Table), ContentFile (org.apache.iceberg.ContentFile), RowDelta (org.apache.iceberg.RowDelta), Result (org.apache.iceberg.actions.RewriteDataFiles.Result), DataFile (org.apache.iceberg.DataFile), FileScanTask (org.apache.iceberg.FileScanTask), Test (org.junit.Test)

Aggregations

RowDelta (org.apache.iceberg.RowDelta): 20
Test (org.junit.Test): 13
DataFile (org.apache.iceberg.DataFile): 9
PartitionSpec (org.apache.iceberg.PartitionSpec): 8
DeleteFile (org.apache.iceberg.DeleteFile): 5
Table (org.apache.iceberg.Table): 5
GenericRecord (org.apache.iceberg.data.GenericRecord): 4
Record (org.apache.iceberg.data.Record): 4
IOException (java.io.IOException): 3
Comparator (java.util.Comparator): 3
List (java.util.List): 3
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 3
Collections (java.util.Collections): 2
Map (java.util.Map): 2
Context (org.apache.hadoop.hive.ql.Context): 2
AppendFiles (org.apache.iceberg.AppendFiles): 2
ContentFile (org.apache.iceberg.ContentFile): 2
FileScanTask (org.apache.iceberg.FileScanTask): 2
MetadataColumns (org.apache.iceberg.MetadataColumns): 2
PartitionKey (org.apache.iceberg.PartitionKey): 2