Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestPositionDeltaWriters, method testPositionDeltaDeleteOnly.
@Test
public void testPositionDeltaDeleteOnly() throws IOException {
  FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());

  // add an unpartitioned data file
  ImmutableList<T> rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa"));
  DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
  table.newFastAppend().appendFile(dataFile1).commit();

  // partition by data
  table.updateSpec().addField(Expressions.ref("data")).commit();

  // add a data file partitioned by data
  ImmutableList<T> rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"));
  DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
  table.newFastAppend().appendFile(dataFile2).commit();

  PartitionSpec unpartitionedSpec = table.specs().get(0);
  PartitionSpec partitionedSpec = table.specs().get(1);

  ClusteredDataWriter<T> insertWriter =
      new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredDataWriter<T> updateWriter =
      new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredPositionDeleteWriter<T> deleteWriter =
      new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  PositionDeltaWriter<T> deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter);

  // positions are 0-based: delete toRow(11, "aaa") from the unpartitioned file
  // and toRow(4, "bbb") from the partitioned file
  deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null);
  deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb"));
  deltaWriter.close();

  WriteResult result = deltaWriter.result();
  DataFile[] dataFiles = result.dataFiles();
  DeleteFile[] deleteFiles = result.deleteFiles();
  CharSequence[] referencedDataFiles = result.referencedDataFiles();
  Assert.assertEquals("Must be 0 data files", 0, dataFiles.length);
  Assert.assertEquals("Must be 2 delete files", 2, deleteFiles.length);
  Assert.assertEquals("Must reference 2 data files", 2, referencedDataFiles.length);

  RowDelta rowDelta = table.newRowDelta();
  for (DeleteFile deleteFile : deleteFiles) {
    rowDelta.addDeletes(deleteFile);
  }
  rowDelta.commit();

  List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"));
  Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
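The test commits through a bare newRowDelta(). When a position-delete commit can race with other writers, the same RowDelta is usually guarded with its validation hooks before committing. A minimal sketch of that pattern, reusing the result variable from the test above; the base-snapshot bookkeeping and variable names are assumptions for illustration, not part of the test:

  // snapshot the deletes were computed against (captured before writing them)
  long baseSnapshotId = table.currentSnapshot().snapshotId();

  RowDelta guardedDelta = table.newRowDelta()
      // only check snapshots committed after the base snapshot for conflicts
      .validateFromSnapshot(baseSnapshotId)
      // fail if a referenced data file was rewritten or dropped concurrently
      .validateDataFilesExist(Arrays.asList(result.referencedDataFiles()))
      .validateDeletedFiles();
  for (DeleteFile deleteFile : result.deleteFiles()) {
    guardedDelta.addDeletes(deleteFile);
  }
  guardedDelta.commit();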
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestPositionDeltaWriters, method testPositionDeltaMultipleSpecs.
@Test
public void testPositionDeltaMultipleSpecs() throws IOException {
  FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());

  // add an unpartitioned data file
  ImmutableList<T> rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa"));
  DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
  table.newFastAppend().appendFile(dataFile1).commit();

  // partition by data
  table.updateSpec().addField(Expressions.ref("data")).commit();

  // add a data file partitioned by data
  ImmutableList<T> rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"));
  DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
  table.newFastAppend().appendFile(dataFile2).commit();

  PartitionSpec unpartitionedSpec = table.specs().get(0);
  PartitionSpec partitionedSpec = table.specs().get(1);

  ClusteredDataWriter<T> insertWriter =
      new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredDataWriter<T> updateWriter =
      new ClusteredDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredPositionDeleteWriter<T> deleteWriter =
      new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  PositionDeltaWriter<T> deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter);

  // delete toRow(11, "aaa") and toRow(4, "bbb") by position, then insert a row into a new partition
  deltaWriter.delete(dataFile1.path(), 2L, unpartitionedSpec, null);
  deltaWriter.delete(dataFile2.path(), 1L, partitionedSpec, partitionKey(partitionedSpec, "bbb"));
  deltaWriter.insert(toRow(10, "ccc"), partitionedSpec, partitionKey(partitionedSpec, "ccc"));
  deltaWriter.close();

  WriteResult result = deltaWriter.result();
  DataFile[] dataFiles = result.dataFiles();
  DeleteFile[] deleteFiles = result.deleteFiles();
  CharSequence[] referencedDataFiles = result.referencedDataFiles();
  Assert.assertEquals("Must be 1 data file", 1, dataFiles.length);
  Assert.assertEquals("Must be 2 delete files", 2, deleteFiles.length);
  Assert.assertEquals("Must reference 2 data files", 2, referencedDataFiles.length);

  RowDelta rowDelta = table.newRowDelta();
  for (DataFile dataFile : dataFiles) {
    rowDelta.addRows(dataFile);
  }
  for (DeleteFile deleteFile : deleteFiles) {
    rowDelta.addDeletes(deleteFile);
  }
  rowDelta.commit();

  List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(10, "ccc"));
  Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
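Both tests build an updateWriter that they never exercise. For completeness, a sketch of the update path it exists for: a merge-on-read UPDATE is modeled as a position delete of the old row plus a write of the replacement row, which BasePositionDeltaWriter routes to the dedicated update writer. This assumes the PositionDeltaWriter interface exposes update(...) alongside insert(...); the row values below are hypothetical:

  // replace the row at position 0 of dataFile2 (the "bbb" partition) with a new value
  StructLike bbbKey = partitionKey(partitionedSpec, "bbb");
  deltaWriter.delete(dataFile2.path(), 0L, partitionedSpec, bbbKey);
  deltaWriter.update(toRow(30, "bbb"), partitionedSpec, bbbKey);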
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestTaskEqualityDeltaWriter, method commitTransaction.
private void commitTransaction(WriteResult result) {
  RowDelta rowDelta = table.newRowDelta();
  Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
  Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
  // fail the commit if any data file referenced by a position delete was removed concurrently
  rowDelta
      .validateDeletedFiles()
      .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles()))
      .commit();
}
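A hypothetical call site for this helper: close the delta writer, then commit every file it produced as a single validated snapshot. The deltaWriter name is assumed for illustration:

  deltaWriter.close();
  commitTransaction(deltaWriter.result());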
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class IcebergSourceDeleteBenchmark, method writeEqDeletes.
private void writeEqDeletes(List<InternalRow> rows) throws IOException {
  int equalityFieldId = table().schema().findField("longCol").fieldId();

  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
      .dataFileFormat(fileFormat())
      .equalityDeleteRowSchema(table().schema())
      .equalityFieldIds(new int[] {equalityFieldId})
      .build();

  ClusteredEqualityDeleteWriter<InternalRow> writer = new ClusteredEqualityDeleteWriter<>(
      writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

  PartitionSpec unpartitionedSpec = table().specs().get(0);
  try (ClusteredEqualityDeleteWriter<InternalRow> closeableWriter = writer) {
    for (InternalRow row : rows) {
      closeableWriter.write(row, unpartitionedSpec, null);
    }
  }

  RowDelta rowDelta = table().newRowDelta();
  LOG.info("Number of delete files: {}", writer.result().deleteFiles().size());
  writer.result().deleteFiles().forEach(rowDelta::addDeletes);
  rowDelta.validateDeletedFiles().commit();
}
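After the commit, any row whose longCol matches one of the written delete rows is filtered out of scans. A hedged verification sketch; spark() and tableLocation stand in for however the benchmark exposes its Spark session and table path, and are assumptions here:

  // rows matching the committed equality-delete keys no longer appear on read
  long remaining = spark().read().format("iceberg").load(tableLocation).count();
  LOG.info("Rows remaining after equality deletes: {}", remaining);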
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestRewriteDataFilesAction, method testBinPackWithDeleteAllData.
@Test
public void testBinPackWithDeleteAllData() {
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.FORMAT_VERSION, "2");
  Table table = createTablePartitioned(1, 1, 1, options);
  shouldHaveFiles(table, 1);
  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();

  RowDelta rowDelta = table.newRowDelta();
  // remove all data by adding a position delete for every row in the file
  writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes);
  rowDelta.commit();
  table.refresh();

  List<Object[]> expectedRecords = currentData();
  Result result = actions()
      .rewriteDataFiles(table)
      .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1")
      .execute();
  Assert.assertEquals("Action should rewrite 1 data file", 1, result.rewrittenDataFilesCount());

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);
  Assert.assertEquals(
      "Data manifest should not have existing data files",
      0,
      (long) table.currentSnapshot().dataManifests().get(0).existingFilesCount());
  Assert.assertEquals(
      "Data manifest should have 1 deleted data file",
      1L,
      (long) table.currentSnapshot().dataManifests().get(0).deletedFilesCount());
  Assert.assertEquals(
      "Delete manifest added row count should equal total record count",
      total,
      (long) table.currentSnapshot().deleteManifests().get(0).addedRowsCount());
}
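writePosDeletesToFile is a test-scoped helper; conceptually it emits one position delete for every row of the target data file, which is what pushes the file over DELETE_FILE_THRESHOLD and makes it eligible for rewriting. A hedged sketch of that idea using the public delete-writer API, with the writer setup elided and the posDeleteWriter, spec, and partition names assumed:

  // assumes a configured ClusteredPositionDeleteWriter<T> named posDeleteWriter
  PositionDelete<T> posDelete = PositionDelete.create();
  for (long pos = 0; pos < dataFile.recordCount(); pos++) {
    // null row: only the file path and position identify the deleted record
    posDeleteWriter.write(posDelete.set(dataFile.path(), pos, null), spec, partition);
  }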