
Example 1 with RowDelta

Use of org.apache.iceberg.RowDelta in the apache/iceberg project.

From the class IcebergSourceDeleteBenchmark, method writePosDeletes.

protected void writePosDeletes(CharSequence path, List<Long> deletedPos, int numNoise) throws IOException {
    OutputFileFactory fileFactory = newFileFactory();
    SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
        .dataFileFormat(fileFormat())
        .build();
    ClusteredPositionDeleteWriter<InternalRow> writer = new ClusteredPositionDeleteWriter<>(
        writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);
    PartitionSpec unpartitionedSpec = table().specs().get(0);
    PositionDelete<InternalRow> positionDelete = PositionDelete.create();
    try (ClusteredPositionDeleteWriter<InternalRow> closeableWriter = writer) {
        for (Long pos : deletedPos) {
            positionDelete.set(path, pos, null);
            closeableWriter.write(positionDelete, unpartitionedSpec, null);
            for (int i = 0; i < numNoise; i++) {
                positionDelete.set(noisePath(path), pos, null);
                closeableWriter.write(positionDelete, unpartitionedSpec, null);
            }
        }
    }
    RowDelta rowDelta = table().newRowDelta();
    writer.result().deleteFiles().forEach(rowDelta::addDeletes);
    rowDelta.validateDeletedFiles().commit();
}
Also used: OutputFileFactory (org.apache.iceberg.io.OutputFileFactory), ClusteredPositionDeleteWriter (org.apache.iceberg.io.ClusteredPositionDeleteWriter), RowDelta (org.apache.iceberg.RowDelta), PartitionSpec (org.apache.iceberg.PartitionSpec), InternalRow (org.apache.spark.sql.catalyst.InternalRow), GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow)
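
The commit at the end of this benchmark is the core RowDelta pattern for position deletes: collect the DeleteFiles produced by the writer, attach each one with addDeletes, and commit them atomically. A minimal sketch of just that step, assuming a Table named table and a list of DeleteFiles produced by a position delete writer such as the one above:

import java.util.List;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;

public class CommitPosDeletes {

    // Attach previously written position delete files to the table in one commit.
    public static void commit(Table table, List<DeleteFile> deleteFiles) {
        RowDelta rowDelta = table.newRowDelta();
        deleteFiles.forEach(rowDelta::addDeletes);
        // Validate that the data files referenced by these deletes have not been removed concurrently.
        rowDelta.validateDeletedFiles();
        rowDelta.commit();
    }
}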

Example 2 with RowDelta

Use of org.apache.iceberg.RowDelta in the apache/iceberg project.

From the class TestRewriteDataFilesAction, method testBinPackWithDeletes.

@Test
public void testBinPackWithDeletes() throws Exception {
    Table table = createTablePartitioned(4, 2);
    table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();
    shouldHaveFiles(table, 8);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();
    RowDelta rowDelta = table.newRowDelta();
    // add 1 delete file for data files 0, 1, 2
    for (int i = 0; i < 3; i++) {
        writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes);
    }
    // add 2 delete files for data files 3, 4
    for (int i = 3; i < 5; i++) {
        writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes);
    }
    rowDelta.commit();
    table.refresh();
    List<Object[]> expectedRecords = currentData();
    Result result = actions().rewriteDataFiles(table)
        .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0")
        .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
        .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
        .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
        .execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount());
    List<Object[]> actualRecords = currentData();
    assertEquals("Rows must match", expectedRecords, actualRecords);
    Assert.assertEquals("7 rows are removed", total - 7, actualRecords.size());
}
Also used: DataFile (org.apache.iceberg.DataFile), Table (org.apache.iceberg.Table), ContentFile (org.apache.iceberg.ContentFile), FileScanTask (org.apache.iceberg.FileScanTask), RowDelta (org.apache.iceberg.RowDelta), Result (org.apache.iceberg.actions.RewriteDataFiles.Result), Test (org.junit.Test)
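
The rewrite in this test is configured purely through action options; BinPackStrategy.DELETE_FILE_THRESHOLD makes any data file with at least that many associated delete files a rewrite candidate regardless of its size. A hedged sketch of a similar invocation from application code rather than the test's actions() helper, assuming a running SparkSession named spark and the iceberg-spark actions module on the classpath (the threshold value is only illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.BinPackStrategy;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;
import org.apache.spark.sql.SparkSession;

public class RewriteFilesWithDeletes {

    // Compact the table, forcing a rewrite of any file carrying 2 or more delete files.
    public static RewriteDataFiles.Result rewrite(SparkSession spark, Table table) {
        return SparkActions.get(spark)
            .rewriteDataFiles(table)
            .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
            .execute();
    }
}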

Example 3 with RowDelta

Use of org.apache.iceberg.RowDelta in the apache/iceberg project.

From the class TestBaseTaskWriter, method testRollIfExceedTargetFileSize.

@Test
public void testRollIfExceedTargetFileSize() throws IOException {
    List<Record> records = Lists.newArrayListWithCapacity(8000);
    for (int i = 0; i < 2000; i++) {
        records.add(createRecord(i, "aaa"));
        records.add(createRecord(i, "bbb"));
        records.add(createRecord(i, "ccc"));
        records.add(createRecord(i, "ddd"));
    }
    WriteResult result;
    try (TaskWriter<Record> taskWriter = createTaskWriter(4)) {
        for (Record record : records) {
            taskWriter.write(record);
        }
        result = taskWriter.complete();
        Assert.assertEquals(8, result.dataFiles().length);
        Assert.assertEquals(0, result.deleteFiles().length);
    }
    RowDelta rowDelta = table.newRowDelta();
    Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
    rowDelta.commit();
    List<Record> expected = Lists.newArrayList();
    try (TestTaskWriter taskWriter = createTaskWriter(3)) {
        for (Record record : records) {
            // ex: UPSERT <0, 'aaa'> to <0, 'AAA'>
            taskWriter.delete(record);
            int id = record.get(0, Integer.class);
            String data = record.get(1, String.class);
            Record newRecord = createRecord(id, data.toUpperCase());
            expected.add(newRecord);
            taskWriter.write(newRecord);
        }
        result = taskWriter.complete();
        Assert.assertEquals(8, result.dataFiles().length);
        Assert.assertEquals(8, result.deleteFiles().length);
    }
    rowDelta = table.newRowDelta();
    Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
    Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
    rowDelta.commit();
    Assert.assertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*"));
}
Also used: Record (org.apache.iceberg.data.Record), GenericRecord (org.apache.iceberg.data.GenericRecord), RowDelta (org.apache.iceberg.RowDelta), Test (org.junit.Test)
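
The second commit above attaches both the new data files and the delete files from one WriteResult, so the upserted rows and the removal of the old rows become visible in a single snapshot. A small sketch of that pattern as a standalone helper, assuming WriteResult is org.apache.iceberg.io.WriteResult (the type returned by complete() above) and table is an Iceberg Table:

import java.util.Arrays;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.WriteResult;

public class CommitTaskWriteResult {

    // Commit a task writer's data files and delete files as one atomic row delta.
    public static void commit(Table table, WriteResult result) {
        RowDelta rowDelta = table.newRowDelta();
        Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
        Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
        rowDelta.commit();
    }
}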

Example 4 with RowDelta

Use of org.apache.iceberg.RowDelta in the apache/iceberg project.

From the class TestGenericSortedPosDeleteWriter, method testMultipleFlush.

@Test
public void testMultipleFlush() throws IOException {
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), null, null, null);
    // It will produce 5 record lists; each list is written into a separate data file:
    // The 1st file has: <0  , val-0>   , <1  , val-1>   , ... , <99 , val-99>
    // The 2nd file has: <100, val-100> , <101, val-101> , ... , <199, val-199>
    // The 3rd file has: <200, val-200> , <201, val-201> , ... , <299, val-299>
    // The 4th file has: <300, val-300> , <301, val-301> , ... , <399, val-399>
    // The 5th file has: <400, val-400> , <401, val-401> , ... , <499, val-499>
    List<DataFile> dataFiles = Lists.newArrayList();
    for (int fileIndex = 0; fileIndex < 5; fileIndex++) {
        List<Record> recordList = Lists.newLinkedList();
        for (int recordIndex = 0; recordIndex < 100; recordIndex++) {
            int id = fileIndex * 100 + recordIndex;
            recordList.add(createRow(id, String.format("val-%s", id)));
        }
        // Write the records and generate the data file.
        dataFiles.add(prepareDataFile(appenderFactory, recordList));
    }
    // Commit those data files to iceberg table.
    RowDelta rowDelta = table.newRowDelta();
    dataFiles.forEach(rowDelta::addRows);
    rowDelta.commit();
    // The final argument (50) is the records-per-file threshold, so the 500 position
    // deletes written below are flushed into 10 separate delete files.
    SortedPosDeleteWriter<Record> writer = new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 50);
    try (SortedPosDeleteWriter<Record> closeableWriter = writer) {
        for (int pos = 0; pos < 100; pos++) {
            for (int fileIndex = 4; fileIndex >= 0; fileIndex--) {
                closeableWriter.delete(dataFiles.get(fileIndex).path(), pos);
            }
        }
    }
    List<DeleteFile> deleteFiles = writer.complete();
    Assert.assertEquals(10, deleteFiles.size());
    Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();
    Record record = GenericRecord.create(pathPosSchema);
    for (int deleteFileIndex = 0; deleteFileIndex < 10; deleteFileIndex++) {
        List<Record> expectedDeletes = Lists.newArrayList();
        for (int dataFileIndex = 0; dataFileIndex < 5; dataFileIndex++) {
            DataFile dataFile = dataFiles.get(dataFileIndex);
            for (long pos = deleteFileIndex * 10; pos < deleteFileIndex * 10 + 10; pos++) {
                expectedDeletes.add(record.copy("file_path", dataFile.path(), "pos", pos));
            }
        }
        DeleteFile deleteFile = deleteFiles.get(deleteFileIndex);
        Assert.assertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path()));
    }
    rowDelta = table.newRowDelta();
    deleteFiles.forEach(rowDelta::addDeletes);
    rowDelta.commit();
    Assert.assertEquals("Should have no record.", expectedRowSet(ImmutableList.of()), actualRowSet("*"));
}
Also used: Schema (org.apache.iceberg.Schema), RowDelta (org.apache.iceberg.RowDelta), GenericAppenderFactory (org.apache.iceberg.data.GenericAppenderFactory), DataFile (org.apache.iceberg.DataFile), GenericRecord (org.apache.iceberg.data.GenericRecord), Record (org.apache.iceberg.data.Record), DeleteFile (org.apache.iceberg.DeleteFile), Test (org.junit.Test)
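
Until the final RowDelta commit, the position deletes written above are just files; a scan of the table would still return all 500 rows. A minimal sketch of verifying the end state with the generic reader from the iceberg-data module, which applies committed delete files during the scan (using IcebergGenerics here is an assumption; the test itself checks the result through its own expectedRowSet/actualRowSet helpers):

import java.io.IOException;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.IcebergGenerics;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.io.CloseableIterable;

public class CountLiveRows {

    // Count rows that survive the committed position deletes; expected to be 0 here.
    public static long countRows(Table table) throws IOException {
        long count = 0;
        try (CloseableIterable<Record> rows = IcebergGenerics.read(table).build()) {
            for (Record ignored : rows) {
                count++;
            }
        }
        return count;
    }
}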

Example 5 with RowDelta

Use of org.apache.iceberg.RowDelta in the apache/iceberg project.

From the class TestPartitioningWriters, method testFanoutDataWriterMultiplePartitions.

@Test
public void testFanoutDataWriterMultiplePartitions() throws IOException {
    table.updateSpec().addField(Expressions.ref("data")).commit();
    FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
    FanoutDataWriter<T> writer = new FanoutDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    PartitionSpec spec = table.spec();
    writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
    writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
    writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
    writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
    writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
    writer.close();
    DataWriteResult result = writer.result();
    Assert.assertEquals("Must be 3 data files", 3, result.dataFiles().size());
    RowDelta rowDelta = table.newRowDelta();
    result.dataFiles().forEach(rowDelta::addRows);
    rowDelta.commit();
    List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(4, "bbb"), toRow(5, "ccc"));
    Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
Also used: RowDelta (org.apache.iceberg.RowDelta), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
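
FanoutDataWriter accepts the rows above in any order because it keeps one open file per partition it has seen; the clustered writers in the same org.apache.iceberg.io package keep only one file open at a time and therefore expect input already grouped by partition. A hedged sketch of the clustered counterpart inside the same test harness, reusing the test's writerFactory, fileFactory, fileFormat, TARGET_FILE_SIZE, toRow, and partitionKey helpers (all assumptions here), with the rows pre-grouped by partition:

    // Sketch only: ClusteredDataWriter requires rows grouped by partition;
    // writing an "aaa" row again after "bbb" would fail because the "aaa" file is closed.
    ClusteredDataWriter<T> clusteredWriter =
        new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
    PartitionSpec spec = table.spec();
    clusteredWriter.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
    clusteredWriter.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
    clusteredWriter.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
    clusteredWriter.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
    clusteredWriter.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
    clusteredWriter.close();

    // Commit exactly as with the fanout writer.
    RowDelta rowDelta = table.newRowDelta();
    clusteredWriter.result().dataFiles().forEach(rowDelta::addRows);
    rowDelta.commit();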

Aggregations

RowDelta (org.apache.iceberg.RowDelta): 20
Test (org.junit.Test): 13
DataFile (org.apache.iceberg.DataFile): 9
PartitionSpec (org.apache.iceberg.PartitionSpec): 8
DeleteFile (org.apache.iceberg.DeleteFile): 5
Table (org.apache.iceberg.Table): 5
GenericRecord (org.apache.iceberg.data.GenericRecord): 4
Record (org.apache.iceberg.data.Record): 4
IOException (java.io.IOException): 3
Comparator (java.util.Comparator): 3
List (java.util.List): 3
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 3
Collections (java.util.Collections): 2
Map (java.util.Map): 2
Context (org.apache.hadoop.hive.ql.Context): 2
AppendFiles (org.apache.iceberg.AppendFiles): 2
ContentFile (org.apache.iceberg.ContentFile): 2
FileScanTask (org.apache.iceberg.FileScanTask): 2
MetadataColumns (org.apache.iceberg.MetadataColumns): 2
PartitionKey (org.apache.iceberg.PartitionKey): 2