Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class IcebergFilesCommitter, method commitDeltaTxn.
private void commitDeltaTxn(NavigableMap<Long, WriteResult> pendingResults, String newFlinkJobId, long checkpointId) {
  int deleteFilesNum = pendingResults.values().stream().mapToInt(r -> r.deleteFiles().length).sum();

  if (deleteFilesNum == 0) {
    // To be compatible with iceberg format V1.
    AppendFiles appendFiles = table.newAppend().scanManifestsWith(workerPool);

    int numFiles = 0;
    for (WriteResult result : pendingResults.values()) {
      Preconditions.checkState(result.referencedDataFiles().length == 0, "Should have no referenced data files.");

      numFiles += result.dataFiles().length;
      Arrays.stream(result.dataFiles()).forEach(appendFiles::appendFile);
    }

    commitOperation(appendFiles, numFiles, 0, "append", newFlinkJobId, checkpointId);
  } else {
    // To be compatible with iceberg format V2.
    for (Map.Entry<Long, WriteResult> e : pendingResults.entrySet()) {
      // We don't commit the merged result in a single transaction because, for sequential transactions txn1 and
      // txn2, the equality-delete files of txn2 are required to be applied to data files from txn1. Committing the
      // merged result would lead to incorrect delete semantics.
      WriteResult result = e.getValue();

      // Row delta validations are not needed for streaming changes that write equality deletes. Equality deletes
      // are applied to data in all previous sequence numbers, so retries may push deletes further into the future,
      // but do not affect correctness. Position deletes committed to the table in this path are used only to delete
      // rows from data files that are being added in this commit. There is no way for data files added along with
      // the delete files to be concurrently removed, so there is no need to validate the files referenced by the
      // position delete files that are being committed.
      RowDelta rowDelta = table.newRowDelta().scanManifestsWith(workerPool);

      int numDataFiles = result.dataFiles().length;
      Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);

      int numDeleteFiles = result.deleteFiles().length;
      Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);

      commitOperation(rowDelta, numDataFiles, numDeleteFiles, "rowDelta", newFlinkJobId, e.getKey());
    }
  }
}
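The commitOperation helper is not shown in this snippet. A minimal sketch of what such a helper plausibly looks like, assuming the surrounding class holds a table field and an SLF4J LOG; the snapshot-summary property keys here are illustrative rather than guaranteed to match Flink's committer, while SnapshotUpdate.set(...) and commit() are real Iceberg APIs:

private void commitOperation(SnapshotUpdate<?> operation, int numDataFiles, int numDeleteFiles,
                             String description, String newFlinkJobId, long checkpointId) {
  LOG.info("Committing {} to table {} with {} data file(s) and {} delete file(s), checkpointId={}",
      description, table, numDataFiles, numDeleteFiles, checkpointId);

  // Stamp the snapshot summary so a restarted job can discover the last committed checkpoint.
  // Property keys are illustrative for this sketch.
  operation.set("flink.job-id", newFlinkJobId);
  operation.set("flink.max-committed-checkpoint-id", Long.toString(checkpointId));

  long startNano = System.nanoTime();
  operation.commit(); // failure handling/abort is left to the caller in this sketch
  LOG.info("Committed in {} ms", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNano));
}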
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestDeltaTaskWriter, method commitTransaction.
private void commitTransaction(WriteResult result) {
  RowDelta rowDelta = table.newRowDelta();
  Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
  Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
  rowDelta.validateDeletedFiles()
      .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles()))
      .commit();
}
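validateDeletedFiles() and validateDataFilesExist(...) are the checks a position-delete commit needs; a concurrent writer would typically also pin validation to the snapshot it read from. A hedged sketch of that fuller pattern, assuming table and result are in scope (all of these methods exist on Iceberg's RowDelta interface):

long baseSnapshotId = table.currentSnapshot().snapshotId(); // the snapshot this writer read from
RowDelta rowDelta = table.newRowDelta()
    .validateFromSnapshot(baseSnapshotId)           // only validate commits made after our read
    .validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles()))
    .validateDeletedFiles()                         // fail if a referenced data file was deleted
    .validateNoConflictingDataFiles()               // detect concurrently added data files
    .validateNoConflictingDeleteFiles();            // detect concurrently added delete files
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
rowDelta.commit();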
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestPartitioningWriters, method testClusteredPositionDeleteWriterMultipleSpecs.
@Test
public void testClusteredPositionDeleteWriterMultipleSpecs() throws IOException {
  FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());

  // add an unpartitioned data file
  ImmutableList<T> rows1 = ImmutableList.of(toRow(1, "aaa"), toRow(2, "aaa"), toRow(11, "aaa"));
  DataFile dataFile1 = writeData(writerFactory, fileFactory, rows1, table.spec(), null);
  table.newFastAppend().appendFile(dataFile1).commit();

  // partition by bucket
  table.updateSpec().addField(Expressions.bucket("data", 16)).commit();

  // add a data file partitioned by bucket
  ImmutableList<T> rows2 = ImmutableList.of(toRow(3, "bbb"), toRow(4, "bbb"), toRow(12, "bbb"));
  DataFile dataFile2 = writeData(writerFactory, fileFactory, rows2, table.spec(), partitionKey(table.spec(), "bbb"));
  table.newFastAppend().appendFile(dataFile2).commit();

  // partition by data
  table.updateSpec().removeField(Expressions.bucket("data", 16)).addField(Expressions.ref("data")).commit();

  // add a data file partitioned by data
  ImmutableList<T> rows3 = ImmutableList.of(toRow(5, "ccc"), toRow(13, "ccc"));
  DataFile dataFile3 = writeData(writerFactory, fileFactory, rows3, table.spec(), partitionKey(table.spec(), "ccc"));
  table.newFastAppend().appendFile(dataFile3).commit();

  ClusteredPositionDeleteWriter<T> writer = new ClusteredPositionDeleteWriter<>(
      writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);

  PartitionSpec unpartitionedSpec = table.specs().get(0);
  PartitionSpec bucketSpec = table.specs().get(1);
  PartitionSpec identitySpec = table.specs().get(2);

  writer.write(positionDelete(dataFile1.path(), 0L, null), unpartitionedSpec, null);
  writer.write(positionDelete(dataFile1.path(), 1L, null), unpartitionedSpec, null);
  writer.write(positionDelete(dataFile2.path(), 0L, null), bucketSpec, partitionKey(bucketSpec, "bbb"));
  writer.write(positionDelete(dataFile2.path(), 1L, null), bucketSpec, partitionKey(bucketSpec, "bbb"));
  writer.write(positionDelete(dataFile3.path(), 0L, null), identitySpec, partitionKey(identitySpec, "ccc"));
  writer.close();

  DeleteWriteResult result = writer.result();
  Assert.assertEquals("Must be 3 delete files", 3, result.deleteFiles().size());
  Assert.assertEquals("Must reference 3 data files", 3, result.referencedDataFiles().size());
  Assert.assertTrue("Must reference data files", result.referencesDataFiles());

  RowDelta rowDelta = table.newRowDelta();
  result.deleteFiles().forEach(rowDelta::addDeletes);
  rowDelta.commit();

  List<T> expectedRows = ImmutableList.of(toRow(11, "aaa"), toRow(12, "bbb"), toRow(13, "ccc"));
  Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
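The positionDelete(...) helper above is test scaffolding; a plausible sketch of what it wraps, using the real org.apache.iceberg.deletes.PositionDelete API (the null third argument in the test means the deleted row itself is not retained in the delete file):

// Hypothetical reconstruction of the test helper; PositionDelete.create() and
// set(path, position, row) are the actual Iceberg API.
private PositionDelete<T> positionDelete(CharSequence path, long pos, T row) {
  PositionDelete<T> delete = PositionDelete.create();
  return delete.set(path, pos, row); // target file path, row ordinal in that file, optional deleted row
}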
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestPositionDeltaWriters, method testPositionDeltaInsertOnly.
@Test
public void testPositionDeltaInsertOnly() throws IOException {
  FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());

  ClusteredDataWriter<T> insertWriter = new ClusteredDataWriter<>(
      writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredDataWriter<T> updateWriter = new ClusteredDataWriter<>(
      writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  ClusteredPositionDeleteWriter<T> deleteWriter = new ClusteredPositionDeleteWriter<>(
      writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);
  PositionDeltaWriter<T> deltaWriter = new BasePositionDeltaWriter<>(insertWriter, updateWriter, deleteWriter);

  deltaWriter.insert(toRow(1, "aaa"), table.spec(), null);
  deltaWriter.close();

  WriteResult result = deltaWriter.result();
  DataFile[] dataFiles = result.dataFiles();
  DeleteFile[] deleteFiles = result.deleteFiles();
  CharSequence[] referencedDataFiles = result.referencedDataFiles();

  Assert.assertEquals("Must be 1 data file", 1, dataFiles.length);
  Assert.assertEquals("Must be no delete files", 0, deleteFiles.length);
  Assert.assertEquals("Must not reference data files", 0, referencedDataFiles.length);

  RowDelta rowDelta = table.newRowDelta();
  for (DataFile dataFile : dataFiles) {
    rowDelta.addRows(dataFile);
  }
  rowDelta.commit();

  List<T> expectedRows = ImmutableList.of(toRow(1, "aaa"));
  Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
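For contrast, a hedged sketch of the delete path through the same writer, which the insert-only test leaves unexercised. dataFile is assumed to come from an earlier commit, as in the clustered-writer test above; PositionDeltaWriter exposes a delete(path, position, spec, partition) method alongside insert:

// Sketch only: dataFile is an assumed, previously committed data file.
deltaWriter.delete(dataFile.path(), 0L, table.spec(), null); // remove row 0 of the existing file
deltaWriter.insert(toRow(2, "bbb"), table.spec(), null);     // and add a new row
deltaWriter.close();

WriteResult result = deltaWriter.result();
RowDelta rowDelta = table.newRowDelta();
Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
// Position deletes reference concrete data files, so validate that they still exist.
rowDelta.validateDataFilesExist(Lists.newArrayList(result.referencedDataFiles()))
    .commit();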
Use of org.apache.iceberg.RowDelta in project hive by apache.
Class HiveIcebergOutputCommitter, method commitWrite.
/**
 * Creates and commits an Iceberg change with the provided data and delete files.
 * If there are no delete files then an Iceberg 'append' is committed, otherwise a 'row delta' is committed.
 * @param table The table we are changing
 * @param startTime The start time of the commit - used only for logging
 * @param results The object containing the new files we would like to add to the table
 */
private void commitWrite(Table table, long startTime, FilesForCommit results) {
  if (results.deleteFiles().isEmpty()) {
    AppendFiles write = table.newAppend();
    results.dataFiles().forEach(write::appendFile);
    write.commit();
  } else {
    RowDelta write = table.newRowDelta();
    results.dataFiles().forEach(write::addRows);
    results.deleteFiles().forEach(write::addDeletes);
    write.commit();
  }

  LOG.info("Write commit took {} ms for table: {} with {} data and {} delete file(s)",
      System.currentTimeMillis() - startTime, table, results.dataFiles().size(), results.deleteFiles().size());
  LOG.debug("Added files {}", results);
}
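Since both branches produce a SnapshotUpdate, the choice can also be factored so common commit logic runs once. A variant sketch, not Hive's actual code; the 'written-by' summary property is purely illustrative:

SnapshotUpdate<?> write;
if (results.deleteFiles().isEmpty()) {
  AppendFiles append = table.newAppend();
  results.dataFiles().forEach(append::appendFile);
  write = append;
} else {
  RowDelta delta = table.newRowDelta();
  results.dataFiles().forEach(delta::addRows);
  results.deleteFiles().forEach(delta::addDeletes);
  write = delta;
}
write.set("written-by", "hive"); // illustrative snapshot summary property
write.commit();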