Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class IcebergSourceDeleteBenchmark, method writePosDeletes:
protected void writePosDeletes(CharSequence path, List<Long> deletedPos, int numNoise) throws IOException {
  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory =
      SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build();
  ClusteredPositionDeleteWriter<InternalRow> writer =
      new ClusteredPositionDeleteWriter<>(
          writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

  PartitionSpec unpartitionedSpec = table().specs().get(0);
  PositionDelete<InternalRow> positionDelete = PositionDelete.create();
  try (ClusteredPositionDeleteWriter<InternalRow> closeableWriter = writer) {
    for (Long pos : deletedPos) {
      // delete the position in the target data file
      positionDelete.set(path, pos, null);
      closeableWriter.write(positionDelete, unpartitionedSpec, null);
      // add noise deletes that point at other data files for the same position
      for (int i = 0; i < numNoise; i++) {
        positionDelete.set(noisePath(path), pos, null);
        closeableWriter.write(positionDelete, unpartitionedSpec, null);
      }
    }
  }

  // commit the produced position delete files as a single row-level delta
  RowDelta rowDelta = table().newRowDelta();
  writer.result().deleteFiles().forEach(rowDelta::addDeletes);
  rowDelta.validateDeletedFiles().commit();
}
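The commit at the end of this benchmark is the core RowDelta pattern for position deletes: register every delete file the writer produced and commit them as a single row-level delta. A minimal sketch of just that step, assuming a Table and a list of already-written DeleteFile instances (the helper name commitPosDeletes is illustrative):

import java.util.List;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;

static void commitPosDeletes(Table table, List<DeleteFile> deleteFiles) {
  RowDelta rowDelta = table.newRowDelta();   // start a new row-level delta
  deleteFiles.forEach(rowDelta::addDeletes); // register each position delete file
  rowDelta
      .validateDeletedFiles()                // same validation chain as the benchmark above
      .commit();                             // produce a single new snapshot
}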
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestRewriteDataFilesAction, method testBinPackWithDeletes:
@Test
public void testBinPackWithDeletes() throws Exception {
  Table table = createTablePartitioned(4, 2);
  table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();
  shouldHaveFiles(table, 8);
  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();

  RowDelta rowDelta = table.newRowDelta();
  // add 1 delete file for data files 0, 1, 2
  for (int i = 0; i < 3; i++) {
    writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes);
  }
  // add 2 delete files for data files 3, 4
  for (int i = 3; i < 5; i++) {
    writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes);
  }
  rowDelta.commit();
  table.refresh();

  List<Object[]> expectedRecords = currentData();
  Result result = actions().rewriteDataFiles(table)
      .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0")
      .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
      .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
      .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
      .execute();
  Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount());

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);
  Assert.assertEquals("7 rows are removed", total - 7, actualRecords.size());
}
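The DELETE_FILE_THRESHOLD option is what narrows the rewrite to two files here: only data files carrying at least that many delete files are considered for compaction, and only data files 3 and 4 have two each. Outside the test harness, the same bin-pack rewrite is typically driven through SparkActions; a minimal sketch, assuming an already-loaded Table named table:

import org.apache.iceberg.actions.BinPackStrategy;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

RewriteDataFiles.Result result = SparkActions.get()
    .rewriteDataFiles(table)                             // 'table' is assumed to be loaded already
    .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")  // only rewrite files with >= 2 delete files
    .execute();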
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestBaseTaskWriter, method testRollIfExceedTargetFileSize:
@Test
public void testRollIfExceedTargetFileSize() throws IOException {
  List<Record> records = Lists.newArrayListWithCapacity(8000);
  for (int i = 0; i < 2000; i++) {
    records.add(createRecord(i, "aaa"));
    records.add(createRecord(i, "bbb"));
    records.add(createRecord(i, "ccc"));
    records.add(createRecord(i, "ddd"));
  }

  WriteResult result;
  try (TaskWriter<Record> taskWriter = createTaskWriter(4)) {
    // write all records; the writer rolls to a new file whenever the target file size is exceeded
    for (Record record : records) {
      taskWriter.write(record);
    }
    result = taskWriter.complete();
    Assert.assertEquals(8, result.dataFiles().length);
    Assert.assertEquals(0, result.deleteFiles().length);
  }

  // commit the data files from the first pass as a row-level delta
  RowDelta rowDelta = table.newRowDelta();
  Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
  rowDelta.commit();

  List<Record> expected = Lists.newArrayList();
  try (TestTaskWriter taskWriter = createTaskWriter(3)) {
    for (Record record : records) {
      // ex: UPSERT <0, 'aaa'> to <0, 'AAA'>
      taskWriter.delete(record);
      int id = record.get(0, Integer.class);
      String data = record.get(1, String.class);
      Record newRecord = createRecord(id, data.toUpperCase());
      expected.add(newRecord);
      taskWriter.write(newRecord);
    }
    result = taskWriter.complete();
    Assert.assertEquals(8, result.dataFiles().length);
    Assert.assertEquals(8, result.deleteFiles().length);
  }

  // commit the upsert: the new data files plus the delete files that mask the old rows
  rowDelta = table.newRowDelta();
  Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);
  Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);
  rowDelta.commit();

  Assert.assertEquals("Should have expected records", expectedRowSet(expected), actualRowSet("*"));
}
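The second commit registers both the new data files and the delete files in the same RowDelta, so the whole upsert becomes visible atomically in one snapshot. A minimal sketch of that combined commit, assuming a Table and a WriteResult produced by a task writer (the helper name commitUpsert is illustrative):

import java.util.Arrays;
import org.apache.iceberg.RowDelta;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.WriteResult;

static void commitUpsert(Table table, WriteResult result) {
  RowDelta rowDelta = table.newRowDelta();
  Arrays.stream(result.dataFiles()).forEach(rowDelta::addRows);       // the rewritten rows
  Arrays.stream(result.deleteFiles()).forEach(rowDelta::addDeletes);  // deletes masking the old rows
  rowDelta.commit();                                                  // one new snapshot for the whole upsert
}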
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestGenericSortedPosDeleteWriter, method testMultipleFlush:
@Test
public void testMultipleFlush() throws IOException {
  FileAppenderFactory<Record> appenderFactory =
      new GenericAppenderFactory(table.schema(), table.spec(), null, null, null);

  // It will produce 5 record lists, and each list is written into a separate data file:
  // The 1st file has: <0 , val-0> , <1 , val-1> , ... , <99 , val-99>
  // The 2nd file has: <100, val-100>, <101, val-101>, ... , <199, val-199>
  // The 3rd file has: <200, val-200>, <201, val-201>, ... , <299, val-299>
  // The 4th file has: <300, val-300>, <301, val-301>, ... , <399, val-399>
  // The 5th file has: <400, val-400>, <401, val-401>, ... , <499, val-499>
  List<DataFile> dataFiles = Lists.newArrayList();
  for (int fileIndex = 0; fileIndex < 5; fileIndex++) {
    List<Record> recordList = Lists.newLinkedList();
    for (int recordIndex = 0; recordIndex < 100; recordIndex++) {
      int id = fileIndex * 100 + recordIndex;
      recordList.add(createRow(id, String.format("val-%s", id)));
    }
    // Write the records and generate the data file.
    dataFiles.add(prepareDataFile(appenderFactory, recordList));
  }

  // Commit those data files to the iceberg table.
  RowDelta rowDelta = table.newRowDelta();
  dataFiles.forEach(rowDelta::addRows);
  rowDelta.commit();

  // The writer flushes a new delete file every 50 buffered position deletes.
  SortedPosDeleteWriter<Record> writer =
      new SortedPosDeleteWriter<>(appenderFactory, fileFactory, format, null, 50);
  try (SortedPosDeleteWriter<Record> closeableWriter = writer) {
    for (int pos = 0; pos < 100; pos++) {
      for (int fileIndex = 4; fileIndex >= 0; fileIndex--) {
        closeableWriter.delete(dataFiles.get(fileIndex).path(), pos);
      }
    }
  }

  // 500 position deletes with a flush threshold of 50 yield 10 delete files.
  List<DeleteFile> deleteFiles = writer.complete();
  Assert.assertEquals(10, deleteFiles.size());

  Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();
  Record record = GenericRecord.create(pathPosSchema);
  for (int deleteFileIndex = 0; deleteFileIndex < 10; deleteFileIndex++) {
    List<Record> expectedDeletes = Lists.newArrayList();
    for (int dataFileIndex = 0; dataFileIndex < 5; dataFileIndex++) {
      DataFile dataFile = dataFiles.get(dataFileIndex);
      for (long pos = deleteFileIndex * 10; pos < deleteFileIndex * 10 + 10; pos++) {
        expectedDeletes.add(record.copy("file_path", dataFile.path(), "pos", pos));
      }
    }
    DeleteFile deleteFile = deleteFiles.get(deleteFileIndex);
    Assert.assertEquals(expectedDeletes, readRecordsAsList(pathPosSchema, deleteFile.path()));
  }

  // Commit the position delete files; every written row is now deleted.
  rowDelta = table.newRowDelta();
  deleteFiles.forEach(rowDelta::addDeletes);
  rowDelta.commit();
  Assert.assertEquals("Should have no record.", expectedRowSet(ImmutableList.of()), actualRowSet("*"));
}
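The expected records above follow the layout of position delete files: DeleteSchemaUtil.pathPosSchema() exposes a file_path column (the path of the data file being deleted from) and a pos column (the 0-based row ordinal within that file). A minimal sketch of building one such record with the same generic data classes used in the test, with a hypothetical file path:

Schema pathPosSchema = DeleteSchemaUtil.pathPosSchema();   // file_path: string, pos: long
Record positionDelete = GenericRecord.create(pathPosSchema)
    .copy("file_path", "file:/tmp/data/file-0.parquet",    // hypothetical data file path
          "pos", 42L);                                      // 0-based row ordinal in that file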
Use of org.apache.iceberg.RowDelta in project iceberg by apache.
Class TestPartitioningWriters, method testFanoutDataWriterMultiplePartitions:
@Test
public void testFanoutDataWriterMultiplePartitions() throws IOException {
  // partition the table by the "data" column
  table.updateSpec().addField(Expressions.ref("data")).commit();

  FileWriterFactory<T> writerFactory = newWriterFactory(table.schema());
  FanoutDataWriter<T> writer =
      new FanoutDataWriter<>(writerFactory, fileFactory, table.io(), fileFormat, TARGET_FILE_SIZE);

  PartitionSpec spec = table.spec();
  // rows arrive with their partitions interleaved; the fanout writer handles the unordered input
  writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
  writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
  writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
  writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
  writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
  writer.close();

  DataWriteResult result = writer.result();
  Assert.assertEquals("Must be 3 data files", 3, result.dataFiles().size());

  RowDelta rowDelta = table.newRowDelta();
  result.dataFiles().forEach(rowDelta::addRows);
  rowDelta.commit();

  List<T> expectedRows = ImmutableList.of(
      toRow(1, "aaa"), toRow(2, "aaa"), toRow(3, "bbb"), toRow(4, "bbb"), toRow(5, "ccc"));
  Assert.assertEquals("Records should match", toSet(expectedRows), actualRowSet("*"));
}
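The FanoutDataWriter is what lets this test interleave partitions (aaa, bbb, aaa, bbb, ccc) in a single write sequence: a fanout writer keeps one open file per partition it encounters, so the incoming rows do not need to be clustered by partition first, at the cost of holding more files open at once. The resulting three data files, one per partition, are then committed through the same RowDelta addRows/commit path used by the unpartitioned examples above.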