Search in sources :

Example 1 with ClusteredPositionDeleteWriter

Use of org.apache.iceberg.io.ClusteredPositionDeleteWriter in the Apache Iceberg project.

From the class IcebergSourceDeleteBenchmark, method writePosDeletes.

/**
 * Writes position deletes for the given data file path and commits them to the table.
 *
 * <p>For every position in {@code deletedPos} one delete is written against {@code path},
 * followed by {@code numNoise} deletes against unrelated noise paths. All resulting delete
 * files are attached to a single row delta and committed with deleted-file validation.
 *
 * @param path       data file path the real deletes target
 * @param deletedPos row positions to delete in that file
 * @param numNoise   number of noise deletes (against other files) emitted per position
 * @throws IOException if writing the delete files fails
 */
protected void writePosDeletes(CharSequence path, List<Long> deletedPos, int numNoise) throws IOException {
    OutputFileFactory fileFactory = newFileFactory();
    SparkFileWriterFactory writerFactory =
        SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build();
    ClusteredPositionDeleteWriter<InternalRow> writer =
        new ClusteredPositionDeleteWriter<>(
            writerFactory, fileFactory, table().io(), fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

    // Table is expected to be unpartitioned; spec 0 is the original (empty) partition spec.
    PartitionSpec spec = table().specs().get(0);
    PositionDelete<InternalRow> delete = PositionDelete.create();

    try (ClusteredPositionDeleteWriter<InternalRow> openWriter = writer) {
        for (Long pos : deletedPos) {
            // One real delete for the target file...
            delete.set(path, pos, null);
            openWriter.write(delete, spec, null);
            // ...plus numNoise deletes pointing at other (noise) files.
            for (int noise = 0; noise < numNoise; noise++) {
                delete.set(noisePath(path), pos, null);
                openWriter.write(delete, spec, null);
            }
        }
    }

    // result() is only valid after the writer has been closed above.
    RowDelta rowDelta = table().newRowDelta();
    writer.result().deleteFiles().forEach(rowDelta::addDeletes);
    rowDelta.validateDeletedFiles().commit();
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) ClusteredPositionDeleteWriter(org.apache.iceberg.io.ClusteredPositionDeleteWriter) RowDelta(org.apache.iceberg.RowDelta) PartitionSpec(org.apache.iceberg.PartitionSpec) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 2 with ClusteredPositionDeleteWriter

Use of org.apache.iceberg.io.ClusteredPositionDeleteWriter in the Apache Hive project.

From the class HiveIcebergBufferedDeleteWriter, method close.

/**
 * Flushes all buffered position deletes to delete files and records them for commit.
 *
 * <p>Unless {@code abort} is set, one writer task is run per buffered partition on a bounded
 * executor (retried up to 3 times). Within a partition, file paths are visited in sorted order
 * and positions are drained from the per-file bitmap in ascending order, as required for
 * position delete files. The generated delete files are collected into {@link FilesForCommit}.
 *
 * @param abort when {@code true}, skip flushing and record an empty delete-file set
 * @throws IOException if writing any delete file fails after retries
 */
@Override
public void close(boolean abort) throws IOException {
    long startTime = System.currentTimeMillis();
    // Concurrent queue: writer tasks on the executor add their results in parallel.
    Collection<DeleteFile> deleteFiles = new ConcurrentLinkedQueue<>();
    if (!abort) {
        LOG.info("Delete file flush is started");
        int size = Math.min(buffer.size(), poolSize);
        ExecutorService fileExecutor = fileExecutor(size);
        try {
            Tasks.foreach(buffer.keySet())
                .retry(3)
                .executeWith(fileExecutor)
                // Fix: a failed flush is an error, not routine information.
                .onFailure((partition, exception) ->
                    LOG.error("Failed to write delete file {}", partition, exception))
                .run(partition -> {
                    PositionDelete<Record> positionDelete = PositionDelete.create();
                    // Fix: parameterize instead of using a raw PartitioningWriter, so
                    // result() returns DeleteWriteResult directly and no cast is needed.
                    PartitioningWriter<PositionDelete<Record>, DeleteWriteResult> writerForFiles;
                    try (ClusteredPositionDeleteWriter<Record> writer =
                        new ClusteredPositionDeleteWriter<>(writerFactory, fileFactory, io, format, targetFileSize)) {
                        Map<String, Roaring64Bitmap> deleteRows = buffer.get(partition);
                        // TreeSet sorts the paths; the clustered writer expects ordered input.
                        for (String filePath : new TreeSet<>(deleteRows.keySet())) {
                            Roaring64Bitmap deletes = deleteRows.get(filePath);
                            PeekableLongIterator longIterator = deletes.getLongIterator();
                            while (longIterator.hasNext()) {
                                long position = longIterator.next();
                                positionDelete.set(filePath, position, null);
                                writer.write(positionDelete, keyToSpec.get(partition), partition);
                            }
                        }
                        // We need the writer object later to get the generated data files;
                        // result() is only valid after close(), i.e. outside this try block.
                        writerForFiles = writer;
                    }
                    deleteFiles.addAll(writerForFiles.result().deleteFiles());
                }, IOException.class);
        } finally {
            fileExecutor.shutdown();
        }
    }
    // Fix: elapsed time comes from System.currentTimeMillis(), so the unit is ms, not ns.
    LOG.info("HiveIcebergBufferedDeleteWriter is closed with abort={}. Created {} delete files and it took {} ms.",
        abort, deleteFiles.size(), System.currentTimeMillis() - startTime);
    LOG.debug("Delete files written {}", deleteFiles);
    this.filesForCommit = FilesForCommit.onlyDelete(deleteFiles);
}
Also used : PartitioningWriter(org.apache.iceberg.io.PartitioningWriter) LoggerFactory(org.slf4j.LoggerFactory) Writable(org.apache.hadoop.io.Writable) TreeSet(java.util.TreeSet) Map(java.util.Map) GenericRecord(org.apache.iceberg.data.GenericRecord) ClusteredPositionDeleteWriter(org.apache.iceberg.io.ClusteredPositionDeleteWriter) PartitionKey(org.apache.iceberg.PartitionKey) Container(org.apache.iceberg.mr.mapred.Container) ExecutorService(java.util.concurrent.ExecutorService) Roaring64Bitmap(org.roaringbitmap.longlong.Roaring64Bitmap) IcebergAcidUtil(org.apache.iceberg.mr.hive.IcebergAcidUtil) Logger(org.slf4j.Logger) InternalRecordWrapper(org.apache.iceberg.data.InternalRecordWrapper) FileWriterFactory(org.apache.iceberg.io.FileWriterFactory) OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) Collection(java.util.Collection) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder) FilesForCommit(org.apache.iceberg.mr.hive.FilesForCommit) IOException(java.io.IOException) PeekableLongIterator(org.roaringbitmap.longlong.PeekableLongIterator) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) Executors(java.util.concurrent.Executors) Record(org.apache.iceberg.data.Record) DeleteWriteResult(org.apache.iceberg.io.DeleteWriteResult) Tasks(org.apache.iceberg.util.Tasks) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) FileIO(org.apache.iceberg.io.FileIO) PositionDelete(org.apache.iceberg.deletes.PositionDelete) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) ClusteredPositionDeleteWriter(org.apache.iceberg.io.ClusteredPositionDeleteWriter) PartitioningWriter(org.apache.iceberg.io.PartitioningWriter) PeekableLongIterator(org.roaringbitmap.longlong.PeekableLongIterator) TreeSet(java.util.TreeSet) ExecutorService(java.util.concurrent.ExecutorService) 
Roaring64Bitmap(org.roaringbitmap.longlong.Roaring64Bitmap) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) DeleteFile(org.apache.iceberg.DeleteFile)

Example 3 with ClusteredPositionDeleteWriter

Use of org.apache.iceberg.io.ClusteredPositionDeleteWriter in the Apache Iceberg project.

From the class WritersBenchmark, method writeUnpartitionedClusteredPositionDeleteWriter.

/**
 * Benchmarks the {@link ClusteredPositionDeleteWriter} on an unpartitioned table: each
 * prepared (path, position) row is written as a single position delete, and the writer is
 * handed to the blackhole so the JIT cannot discard the work.
 *
 * @param blackhole JMH sink consuming the writer to prevent dead-code elimination
 * @throws IOException if writing the delete files fails
 */
@Benchmark
@Threads(1)
public void writeUnpartitionedClusteredPositionDeleteWriter(Blackhole blackhole) throws IOException {
    FileIO io = table().io();
    OutputFileFactory fileFactory = newFileFactory();
    SparkFileWriterFactory writerFactory =
        SparkFileWriterFactory.builderFor(table()).dataFileFormat(fileFormat()).build();
    ClusteredPositionDeleteWriter<InternalRow> writer =
        new ClusteredPositionDeleteWriter<>(
            writerFactory, fileFactory, io, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

    PositionDelete<InternalRow> delete = PositionDelete.create();
    try (ClusteredPositionDeleteWriter<InternalRow> openWriter = writer) {
        for (InternalRow row : positionDeleteRows) {
            // Rows are (file path, position) pairs prepared by the benchmark setup.
            delete.set(row.getString(0), row.getLong(1), null);
            openWriter.write(delete, unpartitionedSpec, null);
        }
    }
    blackhole.consume(writer);
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) ClusteredPositionDeleteWriter(org.apache.iceberg.io.ClusteredPositionDeleteWriter) InternalRow(org.apache.spark.sql.catalyst.InternalRow) FileIO(org.apache.iceberg.io.FileIO) Threads(org.openjdk.jmh.annotations.Threads) Benchmark(org.openjdk.jmh.annotations.Benchmark)

Aggregations

ClusteredPositionDeleteWriter (org.apache.iceberg.io.ClusteredPositionDeleteWriter)3 OutputFileFactory (org.apache.iceberg.io.OutputFileFactory)3 PartitionSpec (org.apache.iceberg.PartitionSpec)2 FileIO (org.apache.iceberg.io.FileIO)2 InternalRow (org.apache.spark.sql.catalyst.InternalRow)2 IOException (java.io.IOException)1 Collection (java.util.Collection)1 Map (java.util.Map)1 TreeSet (java.util.TreeSet)1 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1 Writable (org.apache.hadoop.io.Writable)1 DeleteFile (org.apache.iceberg.DeleteFile)1 FileFormat (org.apache.iceberg.FileFormat)1 PartitionKey (org.apache.iceberg.PartitionKey)1 RowDelta (org.apache.iceberg.RowDelta)1 Schema (org.apache.iceberg.Schema)1 GenericRecord (org.apache.iceberg.data.GenericRecord)1 InternalRecordWrapper (org.apache.iceberg.data.InternalRecordWrapper)1