Search in sources :

Example 1 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class AbstractHoodieLogRecordReader method scan.

public synchronized void scan(Option<List<String>> keys) {
    currentInstantLogBlocks = new ArrayDeque<>();
    progress = 0.0f;
    totalLogFiles = new AtomicLong(0);
    totalRollbacks = new AtomicLong(0);
    totalCorruptBlocks = new AtomicLong(0);
    totalLogBlocks = new AtomicLong(0);
    totalLogRecords = new AtomicLong(0);
    HoodieLogFormatReader logFormatReaderWrapper = null;
    HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline();
    HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants();
    HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights();
    try {
        // Get the key field based on populate meta fields config
        // and the table type
        final String keyField = getKeyField();
        // Iterate over the paths
        logFormatReaderWrapper = new HoodieLogFormatReader(fs, logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField);
        Set<HoodieLogFile> scannedLogFiles = new HashSet<>();
        while (logFormatReaderWrapper.hasNext()) {
            HoodieLogFile logFile = logFormatReaderWrapper.getLogFile();
            LOG.info("Scanning log file " + logFile);
            scannedLogFiles.add(logFile);
            totalLogFiles.set(scannedLogFiles.size());
            // Use the HoodieLogFileReader to iterate through the blocks in the log file
            HoodieLogBlock logBlock = logFormatReaderWrapper.next();
            final String instantTime = logBlock.getLogBlockHeader().get(INSTANT_TIME);
            totalLogBlocks.incrementAndGet();
            if (logBlock.getBlockType() != CORRUPT_BLOCK && !HoodieTimeline.compareTimestamps(logBlock.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime)) {
                // hit a block with instant time greater than should be processed, stop processing further
                break;
            }
            if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) {
                if (!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) || inflightInstantsTimeline.containsInstant(instantTime)) {
                    // hit an uncommitted block possibly from a failed write, move to the next one and skip processing this one
                    continue;
                }
                if (instantRange.isPresent() && !instantRange.get().isInRange(instantTime)) {
                    // filter the log block by instant range
                    continue;
                }
            }
            switch(logBlock.getBlockType()) {
                case HFILE_DATA_BLOCK:
                case AVRO_DATA_BLOCK:
                case PARQUET_DATA_BLOCK:
                    LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + logBlock.getLogBlockHeader().get(INSTANT_TIME));
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is an avro data block belonging to a different commit/instant,
                        // then merge the last blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store the current block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case DELETE_BLOCK:
                    LOG.info("Reading a delete block from file " + logFile.getPath());
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is a delete data block belonging to a different commit/instant,
                        // then merge the last blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store deletes so can be rolled back
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case COMMAND_BLOCK:
                    // Consider the following scenario
                    // (Time 0, C1, Task T1) -> Running
                    // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct
                    // DataBlock (B1) with commitTime C1
                    // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2)
                    // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2)
                    // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same.
                    // Say, commit C1 eventually failed and a rollback is triggered.
                    // Rollback will write only 1 rollback block (R1) since it assumes one block is
                    // written per ingestion batch for a file but in reality we need to rollback (B1 & B2)
                    // The following code ensures the same rollback block (R1) is used to rollback
                    // both B1 & B2
                    LOG.info("Reading a command block from file " + logFile.getPath());
                    // This is a command block - take appropriate action based on the command
                    HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock;
                    String targetInstantForCommandBlock = logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME);
                    switch(// there can be different types of command blocks
                    commandBlock.getType()) {
                        case ROLLBACK_PREVIOUS_BLOCK:
                            // Rollback the last read log block
                            // Get commit time from last record block, compare with targetCommitTime,
                            // rollback only if equal, this is required in scenarios of invalid/extra
                            // rollback blocks written due to failures during the rollback operation itself
                            // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with
                            // same instant_time
                            int numBlocksRolledBack = 0;
                            totalRollbacks.incrementAndGet();
                            while (!currentInstantLogBlocks.isEmpty()) {
                                HoodieLogBlock lastBlock = currentInstantLogBlocks.peek();
                                // handle corrupt blocks separately since they may not have metadata
                                if (lastBlock.getBlockType() == CORRUPT_BLOCK) {
                                    LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) {
                                    // rollback last data block or delete block
                                    LOG.info("Rolling back the last log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (!targetInstantForCommandBlock.contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) {
                                    // invalid or extra rollback block
                                    LOG.warn("TargetInstantTime " + targetInstantForCommandBlock + " invalid or extra rollback command block in " + logFile.getPath());
                                    break;
                                } else {
                                    // this should not happen ideally
                                    LOG.warn("Unable to apply rollback command block in " + logFile.getPath());
                                }
                            }
                            LOG.info("Number of applied rollback blocks " + numBlocksRolledBack);
                            break;
                        default:
                            throw new UnsupportedOperationException("Command type not yet supported.");
                    }
                    break;
                case CORRUPT_BLOCK:
                    LOG.info("Found a corrupt block in " + logFile.getPath());
                    totalCorruptBlocks.incrementAndGet();
                    // If there is a corrupt block - we will assume that this was the next data block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                default:
                    throw new UnsupportedOperationException("Block type not supported yet");
            }
        }
        // merge the last read block when all the blocks are done reading
        if (!currentInstantLogBlocks.isEmpty()) {
            LOG.info("Merging the final data blocks");
            processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
        }
        // Done
        progress = 1.0f;
    } catch (IOException e) {
        LOG.error("Got IOException when reading log file", e);
        throw new HoodieIOException("IOException when reading log file ", e);
    } catch (Exception e) {
        LOG.error("Got exception when reading log file", e);
        throw new HoodieException("Exception when reading log file ", e);
    } finally {
        try {
            if (null != logFormatReaderWrapper) {
                logFormatReaderWrapper.close();
            }
        } catch (IOException ioe) {
            // Eat exception as we do not want to mask the original exception that can happen
            LOG.error("Unable to close log format reader", ioe);
        }
    }
}
Also used : Arrays(java.util.Arrays) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) Deque(java.util.Deque) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) CORRUPT_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) COMMAND_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.COMMAND_BLOCK) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) INSTANT_TIME(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) ArrayDeque(java.util.ArrayDeque) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) SpillableMapUtils(org.apache.hudi.common.util.SpillableMapUtils) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) AtomicLong(java.util.concurrent.atomic.AtomicLong) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HashSet(java.util.HashSet)

Example 2 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class HoodieLogFormatWriter method appendBlocks.

@Override
public AppendResult appendBlocks(List<HoodieLogBlock> blocks) throws IOException, InterruptedException {
    // Find current version
    HoodieLogFormat.LogFormatVersion currentLogFormatVersion = new HoodieLogFormatVersion(HoodieLogFormat.CURRENT_VERSION);
    FSDataOutputStream originalOutputStream = getOutputStream();
    long startPos = originalOutputStream.getPos();
    long sizeWritten = 0;
    // HUDI-2655. here we wrap originalOutputStream to ensure huge blocks can be correctly written
    FSDataOutputStream outputStream = new FSDataOutputStream(originalOutputStream, new FileSystem.Statistics(fs.getScheme()), startPos);
    for (HoodieLogBlock block : blocks) {
        long startSize = outputStream.size();
        // 1. Write the magic header for the start of the block
        outputStream.write(HoodieLogFormat.MAGIC);
        // bytes for header
        byte[] headerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockHeader());
        // content bytes
        byte[] content = block.getContentBytes();
        // bytes for footer
        byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter());
        // 2. Write the total size of the block (excluding Magic)
        outputStream.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length));
        // 3. Write the version of this log block
        outputStream.writeInt(currentLogFormatVersion.getVersion());
        // 4. Write the block type
        outputStream.writeInt(block.getBlockType().ordinal());
        // 5. Write the headers for the log block
        outputStream.write(headerBytes);
        // 6. Write the size of the content block
        outputStream.writeLong(content.length);
        // 7. Write the contents of the data block
        outputStream.write(content);
        // 8. Write the footers for the log block
        outputStream.write(footerBytes);
        // 9. Write the total size of the log block (including magic) which is everything written
        // until now (for reverse pointer)
        // Update: this information is now used in determining if a block is corrupt by comparing to the
        // block size in header. This change assumes that the block size will be the last data written
        // to a block. Read will break if any data is written past this point for a block.
        outputStream.writeLong(outputStream.size() - startSize);
        // HUDI-2655. Check the size written to avoid log blocks whose size overflow.
        if (outputStream.size() == Integer.MAX_VALUE) {
            throw new HoodieIOException("Blocks appended may overflow. Please decrease log block size or log block amount");
        }
        sizeWritten += outputStream.size() - startSize;
    }
    // Flush all blocks to disk
    flush();
    AppendResult result = new AppendResult(logFile, startPos, sizeWritten);
    // roll over if size is past the threshold
    rolloverIfNeeded();
    return result;
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)

Example 3 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TableSchemaResolver method readSchemaFromLogFile.

/**
 * Read the schema from the log file on path.
 *
 * @return
 */
public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
    Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
    HoodieDataBlock lastBlock = null;
    while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        if (block instanceof HoodieDataBlock) {
            lastBlock = (HoodieDataBlock) block;
        }
    }
    reader.close();
    if (lastBlock != null) {
        return new AvroSchemaConverter().convert(lastBlock.getSchema());
    }
    return null;
}
Also used : HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) HoodieHFileReader(org.apache.hudi.io.storage.HoodieHFileReader) HoodieOrcReader(org.apache.hudi.io.storage.HoodieOrcReader) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Example 4 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieLogFormat method testBasicAppendAndReadInReverse.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testBasicAppendAndReadInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
    Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    Schema schema = getSimpleSchema();
    List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
    writer.appendBlock(dataBlock);
    writer.close();
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords2 = records2.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
    writer.appendBlock(dataBlock);
    writer.close();
    // Close and Open again and append 100 more records
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords3 = records3.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
    writer.appendBlock(dataBlock);
    writer.close();
    FileCreateUtils.createDeltaCommit(basePath, "100", fs);
    HoodieLogFileReader reader = new HoodieLogFileReader(fs, new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()), SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
    assertTrue(reader.hasPrev(), "Last block should be available");
    HoodieLogBlock prevBlock = reader.prev();
    HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
    List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords3.size(), recordsRead1.size(), "Third records size should be equal to the written records size");
    assertEquals(copyOfRecords3, recordsRead1, "Both records lists should be the same. (ordering guaranteed)");
    assertTrue(reader.hasPrev(), "Second block should be available");
    prevBlock = reader.prev();
    dataBlockRead = (HoodieDataBlock) prevBlock;
    List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)");
    assertTrue(reader.hasPrev(), "First block should be available");
    prevBlock = reader.prev();
    dataBlockRead = (HoodieDataBlock) prevBlock;
    List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords1.size(), recordsRead3.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords1, recordsRead3, "Both records lists should be the same. (ordering guaranteed)");
    assertFalse(reader.hasPrev());
    reader.close();
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) AfterAll(org.junit.jupiter.api.AfterAll) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) BeforeAll(org.junit.jupiter.api.BeforeAll) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) Collection(java.util.Collection) Compression(org.apache.hadoop.hbase.io.compress.Compression) Set(java.util.Set) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) MiniClusterUtil(org.apache.hudi.common.testutils.minicluster.MiniClusterUtil) List(java.util.List) Stream(java.util.stream.Stream) HadoopMapRedUtils(org.apache.hudi.common.testutils.HadoopMapRedUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) IndexedRecord(org.apache.avro.generic.IndexedRecord) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) AppendResult(org.apache.hudi.common.table.log.AppendResult) IOException(java.io.IOException) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) BenchmarkCounter(org.apache.parquet.hadoop.util.counters.BenchmarkCounter) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 5 with HoodieLogBlock

use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.

the class TestHoodieLogFormat method testBasicAppendAndRead.

@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK" })
public void testBasicAppendAndRead(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
    Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
    Schema schema = getSimpleSchema();
    List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
    HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records1, header);
    writer.appendBlock(dataBlock);
    writer.close();
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords2 = records2.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
    dataBlock = getDataBlock(dataBlockType, records2, header);
    writer.appendBlock(dataBlock);
    writer.close();
    // Close and Open again and append 100 more records
    writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
    List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords3 = records3.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
    dataBlock = getDataBlock(dataBlockType, records3, header);
    writer.appendBlock(dataBlock);
    writer.close();
    Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
    assertTrue(reader.hasNext(), "First block should be available");
    HoodieLogBlock nextBlock = reader.next();
    HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
    List<IndexedRecord> recordsRead1 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords1.size(), recordsRead1.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords1, recordsRead1, "Both records lists should be the same. (ordering guaranteed)");
    assertEquals(dataBlockRead.getSchema(), getSimpleSchema());
    reader.hasNext();
    nextBlock = reader.next();
    dataBlockRead = (HoodieDataBlock) nextBlock;
    List<IndexedRecord> recordsRead2 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords2.size(), recordsRead2.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords2, recordsRead2, "Both records lists should be the same. (ordering guaranteed)");
    reader.hasNext();
    nextBlock = reader.next();
    dataBlockRead = (HoodieDataBlock) nextBlock;
    List<IndexedRecord> recordsRead3 = getRecords(dataBlockRead);
    assertEquals(copyOfRecords3.size(), recordsRead3.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords3, recordsRead3, "Both records lists should be the same. (ordering guaranteed)");
    reader.close();
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) AfterAll(org.junit.jupiter.api.AfterAll) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) BeforeAll(org.junit.jupiter.api.BeforeAll) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) Collection(java.util.Collection) Compression(org.apache.hadoop.hbase.io.compress.Compression) Set(java.util.Set) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) MiniClusterUtil(org.apache.hudi.common.testutils.minicluster.MiniClusterUtil) List(java.util.List) Stream(java.util.stream.Stream) HadoopMapRedUtils(org.apache.hudi.common.testutils.HadoopMapRedUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) IndexedRecord(org.apache.avro.generic.IndexedRecord) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) AppendResult(org.apache.hudi.common.table.log.AppendResult) IOException(java.io.IOException) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) BenchmarkCounter(org.apache.parquet.hadoop.util.counters.BenchmarkCounter) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock)21 HoodieDataBlock (org.apache.hudi.common.table.log.block.HoodieDataBlock)16 IndexedRecord (org.apache.avro.generic.IndexedRecord)15 Schema (org.apache.avro.Schema)14 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)12 Reader (org.apache.hudi.common.table.log.HoodieLogFormat.Reader)12 HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType)11 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)10 GenericRecord (org.apache.avro.generic.GenericRecord)9 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)9 FileSystem (org.apache.hadoop.fs.FileSystem)9 HoodieLogFileReader (org.apache.hudi.common.table.log.HoodieLogFileReader)9 HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat)9 Writer (org.apache.hudi.common.table.log.HoodieLogFormat.Writer)9 HoodieDeleteBlock (org.apache.hudi.common.table.log.block.HoodieDeleteBlock)9 IOException (java.io.IOException)8 List (java.util.List)8 FileStatus (org.apache.hadoop.fs.FileStatus)8