
Example 1 with HoodieDeleteBlock

Use of org.apache.hudi.common.table.log.block.HoodieDeleteBlock in project hudi by apache.

From the class TestHoodieLogFormat, method testAvroLogRecordReaderWithInsertDeleteAndRollback:

@ParameterizedTest
@MethodSource("testArguments")
public void testAvroLogRecordReaderWithInsertDeleteAndRollback(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
    // Write a Data block and Delete block with same InstantTime (written in same batch)
    Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
    // Build a delta-log writer for fileId "test-fileid1" on base commit "100"
    Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-fileid1")
        .overBaseCommit("100")
        .withFs(fs)
        .build();
    // Write 1
    List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
    writer.appendBlock(dataBlock);
    // Delete 50 keys
    List<HoodieKey> deletedKeys = copyOfRecords1.stream()
        .map(s -> new HoodieKey(
            ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(),
            ((GenericRecord) s).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString()))
        .collect(Collectors.toList())
        .subList(0, 50);
    HoodieDeleteBlock deleteBlock = new HoodieDeleteBlock(deletedKeys.toArray(new HoodieKey[50]), header);
    writer.appendBlock(deleteBlock);
    FileCreateUtils.createDeltaCommit(basePath, "100", fs);
    // Write 2 rollback blocks (one for the data block, one for the delete block) for a failed write
    header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
    HoodieCommandBlock commandBlock = new HoodieCommandBlock(header);
    writer.appendBlock(commandBlock);
    writer.appendBlock(commandBlock);
    List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
        .map(s -> s.getPath().toString())
        .collect(Collectors.toList());
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(basePath)
        .withLogFilePaths(allLogFiles)
        .withReaderSchema(schema)
        .withLatestInstantTime("100")
        .withMaxMemorySizeInBytes(10240L)
        .withReadBlocksLazily(readBlocksLazily)
        .withReverseReader(false)
        .withBufferSize(bufferSize)
        .withSpillableMapBasePath(BASE_OUTPUT_PATH)
        .withDiskMapType(diskMapType)
        .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled)
        .build();
    assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records");
    FileCreateUtils.deleteDeltaCommit(basePath, "100", fs);
}
Also used : HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
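
Why the scanner then reads zero records: a ROLLBACK_PREVIOUS_BLOCK command block tells the log reader to discard previously queued blocks whose INSTANT_TIME matches the command block's TARGET_INSTANT_TIME header, so the data block and the delete block written for instant "100" are both invalidated. A minimal, hedged sketch of building such a command block, reusing the header fields from the test:

Map<HoodieLogBlock.HeaderMetadataType, String> rollbackHeader = new HashMap<>();
rollbackHeader.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
// The instant whose previously written blocks should be rolled back.
rollbackHeader.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
rollbackHeader.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
    String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
writer.appendBlock(new HoodieCommandBlock(rollbackHeader));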

Example 2 with HoodieDeleteBlock

Use of org.apache.hudi.common.table.log.block.HoodieDeleteBlock in project hudi by apache.

From the class HoodieBackedTableMetadataWriter, method initializeFileGroups:

/**
 * Initialize file groups for a partition. For file listing, we just have one file group.
 *
 * All file groups for a given metadata partition have a fixed prefix, as per {@link MetadataPartitionType#getFileIdPrefix()}.
 * Each file group is suffixed with a 4-digit index, starting at 0000 and incremented by 1.
 *
 * Let's say we configure 10 file groups for the record-level index partition with the prefix "record-index-bucket-".
 * The file groups will be named:
 *    record-index-bucket-0000, record-index-bucket-0001, ..., record-index-bucket-0009
 */
private void initializeFileGroups(HoodieTableMetaClient dataMetaClient, MetadataPartitionType metadataPartition, String instantTime, int fileGroupCount) throws IOException {
    final HashMap<HeaderMetadataType, String> blockHeader = new HashMap<>();
    blockHeader.put(HeaderMetadataType.INSTANT_TIME, instantTime);
    // Archival of the data table depends on compaction (base files) in the metadata table.
    // It is assumed that, as of the time Tx of the base instant (/compaction time) in the metadata table,
    // all commits in the data table are in sync with the metadata table. So we always start with a log file for any file group.
    final HoodieDeleteBlock block = new HoodieDeleteBlock(new HoodieKey[0], blockHeader);
    LOG.info(String.format("Creating %d file groups for partition %s with base fileId %s at instant time %s", fileGroupCount, metadataPartition.getPartitionPath(), metadataPartition.getFileIdPrefix(), instantTime));
    for (int i = 0; i < fileGroupCount; ++i) {
        final String fileGroupFileId = String.format("%s%04d", metadataPartition.getFileIdPrefix(), i);
        try {
            HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
                .onParentPath(FSUtils.getPartitionPath(metadataWriteConfig.getBasePath(), metadataPartition.getPartitionPath()))
                .withFileId(fileGroupFileId)
                .overBaseCommit(instantTime)
                .withLogVersion(HoodieLogFile.LOGFILE_BASE_VERSION)
                .withFileSize(0L)
                .withSizeThreshold(metadataWriteConfig.getLogFileMaxSize())
                .withFs(dataMetaClient.getFs())
                .withRolloverLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN)
                .withLogWriteToken(HoodieLogFormat.DEFAULT_WRITE_TOKEN)
                .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
                .build();
            writer.appendBlock(block);
            writer.close();
        } catch (InterruptedException e) {
            throw new HoodieException("Failed to created fileGroup " + fileGroupFileId + " for partition " + metadataPartition.getPartitionPath(), e);
        }
    }
}
Also used : HashMap(java.util.HashMap) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieException(org.apache.hudi.exception.HoodieException)
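
As a quick illustration of the naming scheme described in the Javadoc above, the %04d format used in the loop produces the zero-padded suffixes (the prefix string here is just the Javadoc's example):

// Prints record-index-bucket-0000 through record-index-bucket-0009.
String fileIdPrefix = "record-index-bucket-";
for (int i = 0; i < 10; i++) {
    System.out.println(String.format("%s%04d", fileIdPrefix, i));
}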

Example 3 with HoodieDeleteBlock

Use of org.apache.hudi.common.table.log.block.HoodieDeleteBlock in project hudi by apache.

From the class AbstractHoodieLogRecordReader, method processQueuedBlocksForInstant:

/**
 * Process the set of log blocks belonging to the last instant which is read fully.
 */
private void processQueuedBlocksForInstant(Deque<HoodieLogBlock> logBlocks, int numLogFilesSeen, Option<List<String>> keys) throws Exception {
    while (!logBlocks.isEmpty()) {
        LOG.info("Number of remaining logblocks to merge " + logBlocks.size());
        // poll the element at the bottom of the stack since that's the order it was inserted
        HoodieLogBlock lastBlock = logBlocks.pollLast();
        switch(lastBlock.getBlockType()) {
            case AVRO_DATA_BLOCK:
                processDataBlock((HoodieAvroDataBlock) lastBlock, keys);
                break;
            case HFILE_DATA_BLOCK:
                processDataBlock((HoodieHFileDataBlock) lastBlock, keys);
                break;
            case PARQUET_DATA_BLOCK:
                processDataBlock((HoodieParquetDataBlock) lastBlock, keys);
                break;
            case DELETE_BLOCK:
                Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey);
                break;
            case CORRUPT_BLOCK:
                LOG.warn("Found a corrupt block which was not rolled back");
                break;
            default:
                break;
        }
    }
    // At this step the lastBlocks are consumed. We track approximate progress by the number of log files seen.
    progress = (float) (numLogFilesSeen - 1) / logFilePaths.size();
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock)
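
The pollLast call relies on plain java.util.Deque semantics: elements pushed onto the head of the deque come back out in insertion order when drained from the tail, which is exactly what the inline comment describes. A standalone sketch:

import java.util.ArrayDeque;
import java.util.Deque;

Deque<String> logBlocks = new ArrayDeque<>();
logBlocks.push("block-1"); // push() adds at the head
logBlocks.push("block-2");
logBlocks.push("block-3");
while (!logBlocks.isEmpty()) {
    // pollLast() drains from the tail: block-1, block-2, block-3
    System.out.println(logBlocks.pollLast());
}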

Example 4 with HoodieDeleteBlock

Use of org.apache.hudi.common.table.log.block.HoodieDeleteBlock in project hudi by apache.

From the class HoodieLogFileReader, method readBlock:

// TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows
// for max of Integer size
private HoodieLogBlock readBlock() throws IOException {
    int blockSize;
    try {
        // 1. Read the total size of the block
        blockSize = (int) inputStream.readLong();
    } catch (EOFException | CorruptedLogFileException e) {
        // Create a corrupt block by finding the next MAGIC marker or EOF
        return createCorruptBlock();
    }
    // We may have had a crash which could have written this block partially
    // Skip blockSize in the stream and we should either find a sync marker (start of the next
    // block) or EOF. If we did not find either of it, then this block is a corrupted block.
    boolean isCorrupted = isBlockCorrupted(blockSize);
    if (isCorrupted) {
        return createCorruptBlock();
    }
    // 2. Read the version for this log format
    HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion();
    // 3. Read the block type for a log block
    HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion);
    // 4. Read the header for a log block, if present
    Map<HeaderMetadataType, String> header = nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
    // 5. Read the content length for the content
    // Fallback to full-block size if no content-length
    // TODO replace w/ hasContentLength
    int contentLength = nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize;
    // 6. Read the content or skip content based on IO vs Memory trade-off by client
    long contentPosition = inputStream.getPos();
    boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION;
    Option<byte[]> content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily);
    // 7. Read footer if any
    Map<HeaderMetadataType, String> footer = nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
    // 8. Read the log block length, if present; this trailing length is what
    // allows reading the log file in reverse
    if (nextBlockVersion.hasLogBlockLength()) {
        inputStream.readLong();
    }
    // 9. Read the log block end position in the log file
    long blockEndPos = inputStream.getPos();
    HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos);
    switch(Objects.requireNonNull(blockType)) {
        case AVRO_DATA_BLOCK:
            if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
                return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
            } else {
                return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
            }
        case HFILE_DATA_BLOCK:
            checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, String.format("HFile block cannot be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
            return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
        case PARQUET_DATA_BLOCK:
            checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION, String.format("Parquet block cannot be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
            return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
        case DELETE_BLOCK:
            return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
        case COMMAND_BLOCK:
            return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
        default:
            throw new HoodieNotSupportedException("Unsupported Block " + blockType);
    }
}
Also used : HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) EOFException(java.io.EOFException)
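
The TODO at the top of readBlock flags that the long block size is narrowed to an int, which silently truncates for blocks larger than Integer.MAX_VALUE bytes. A hedged sketch of a defensive version of step 1 (not the project's code):

// Guard the long-to-int narrowing that the TODO warns about.
long rawBlockSize = inputStream.readLong();
if (rawBlockSize < 0 || rawBlockSize > Integer.MAX_VALUE) {
    throw new IOException("Unsupported log block size: " + rawBlockSize);
}
int blockSize = (int) rawBlockSize;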

Example 5 with HoodieDeleteBlock

Use of org.apache.hudi.common.table.log.block.HoodieDeleteBlock in project hudi by apache.

From the class HoodieAppendHandle, method appendDataAndDeleteBlocks:

protected void appendDataAndDeleteBlocks(Map<HeaderMetadataType, String> header) {
    try {
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString());
        List<HoodieLogBlock> blocks = new ArrayList<>(2);
        if (recordList.size() > 0) {
            String keyField = config.populateMetaFields() ? HoodieRecord.RECORD_KEY_METADATA_FIELD : hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
            blocks.add(getBlock(config, pickLogDataBlockFormat(), recordList, header, keyField));
        }
        if (keysToDelete.size() > 0) {
            blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header));
        }
        if (blocks.size() > 0) {
            AppendResult appendResult = writer.appendBlocks(blocks);
            processAppendResult(appendResult, recordList);
            recordList.clear();
            keysToDelete.clear();
        }
    } catch (Exception e) {
        throw new HoodieAppendException("Failed while appending records to " + writer.getLogFile().getPath(), e);
    }
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) ArrayList(java.util.ArrayList) HoodieAppendException(org.apache.hudi.exception.HoodieAppendException) AppendResult(org.apache.hudi.common.table.log.AppendResult) HoodieException(org.apache.hudi.exception.HoodieException) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAppendException(org.apache.hudi.exception.HoodieAppendException) IOException(java.io.IOException)
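
One small style note on the delete-block line: toArray(new HoodieKey[keysToDelete.size()]) and the zero-length form toArray(new HoodieKey[0]) are equivalent, and the latter is the more common idiom since the runtime sizes the array itself. A self-contained sketch:

import java.util.Arrays;
import java.util.List;

List<String> keys = Arrays.asList("key-1", "key-2");
String[] sized = keys.toArray(new String[keys.size()]); // explicit size, as in the example above
String[] zero = keys.toArray(new String[0]);            // equivalent zero-length-array idiom
System.out.println(sized.length + " " + zero.length);   // prints: 2 2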

Aggregations

HoodieDeleteBlock (org.apache.hudi.common.table.log.block.HoodieDeleteBlock): 8
HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock): 6
IOException (java.io.IOException): 5
ArrayList (java.util.ArrayList): 5
HashMap (java.util.HashMap): 5
AppendResult (org.apache.hudi.common.table.log.AppendResult): 5
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat): 5
HoodieAvroDataBlock (org.apache.hudi.common.table.log.block.HoodieAvroDataBlock): 5
HoodieCommandBlock (org.apache.hudi.common.table.log.block.HoodieCommandBlock): 5
HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType): 5
UncheckedIOException (java.io.UncheckedIOException): 4
URISyntaxException (java.net.URISyntaxException): 4
Collection (java.util.Collection): 4
Collections (java.util.Collections): 4
HashSet (java.util.HashSet): 4
List (java.util.List): 4
Map (java.util.Map): 4
Set (java.util.Set): 4
Collectors (java.util.stream.Collectors): 4
Stream (java.util.stream.Stream): 4