Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class AbstractHoodieLogRecordReader, method processQueuedBlocksForInstant.
/**
 * Process the set of log blocks belonging to the last instant which is read fully.
 */
private void processQueuedBlocksForInstant(Deque<HoodieLogBlock> logBlocks, int numLogFilesSeen, Option<List<String>> keys) throws Exception {
  while (!logBlocks.isEmpty()) {
    LOG.info("Number of remaining logblocks to merge " + logBlocks.size());
    // poll the element at the bottom of the stack since that's the order it was inserted
    HoodieLogBlock lastBlock = logBlocks.pollLast();
    switch (lastBlock.getBlockType()) {
      case AVRO_DATA_BLOCK:
        processDataBlock((HoodieAvroDataBlock) lastBlock, keys);
        break;
      case HFILE_DATA_BLOCK:
        processDataBlock((HoodieHFileDataBlock) lastBlock, keys);
        break;
      case PARQUET_DATA_BLOCK:
        processDataBlock((HoodieParquetDataBlock) lastBlock, keys);
        break;
      case DELETE_BLOCK:
        Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey);
        break;
      case CORRUPT_BLOCK:
        LOG.warn("Found a corrupt block which was not rolled back");
        break;
      default:
        break;
    }
  }
  // At this step the lastBlocks are consumed. We track approximate progress by number of log-files seen
  progress = (numLogFilesSeen - 1) / logFilePaths.size();
}
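The detail worth calling out is the traversal order: blocks are pushed onto the deque as the log is scanned, but merged via pollLast(), so they are replayed in the order they were originally written. A minimal, self-contained sketch of that pattern (plain Java, no Hudi types; the strings are placeholders):

import java.util.ArrayDeque;
import java.util.Deque;

public class BlockOrderSketch {
  public static void main(String[] args) {
    Deque<String> blocks = new ArrayDeque<>();
    // blocks are pushed as they are scanned from the log, so the newest ends up on top
    blocks.push("block-1 (oldest)");
    blocks.push("block-2");
    blocks.push("block-3 (newest)");
    // pollLast() drains from the bottom of the stack, so the blocks come back
    // in the order they were written: block-1, block-2, block-3
    while (!blocks.isEmpty()) {
      System.out.println("Merging " + blocks.pollLast());
    }
  }
}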
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class HoodieLogFileReader, method readBlock.
// TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows
// for max of Integer size
private HoodieLogBlock readBlock() throws IOException {
  int blockSize;
  try {
    // 1. Read the total size of the block
    blockSize = (int) inputStream.readLong();
  } catch (EOFException | CorruptedLogFileException e) {
    // Create a corrupt block by finding the next MAGIC marker or EOF
    return createCorruptBlock();
  }
  // We may have had a crash which could have written this block partially.
  // Skip blockSize in the stream and we should either find a sync marker (start of the next
  // block) or EOF. If we find neither, then this block is corrupted.
  boolean isCorrupted = isBlockCorrupted(blockSize);
  if (isCorrupted) {
    return createCorruptBlock();
  }
  // 2. Read the version for this log format
  HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion();
  // 3. Read the block type for a log block
  HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion);
  // 4. Read the header for a log block, if present
  Map<HeaderMetadataType, String> header = nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
  // 5. Read the content length for the content
  // Fall back to the full block size if there is no content length
  // TODO replace w/ hasContentLength
  int contentLength = nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize;
  // 6. Read the content or skip content based on IO vs Memory trade-off by client
  long contentPosition = inputStream.getPos();
  boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION;
  Option<byte[]> content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily);
  // 7. Read footer if any
  Map<HeaderMetadataType, String> footer = nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
  // 8. Read the total block length, if present. This acts as a reverse pointer when traversing the
  // log file in reverse
  if (nextBlockVersion.hasLogBlockLength()) {
    inputStream.readLong();
  }
  // 9. Read the log block end position in the log file
  long blockEndPos = inputStream.getPos();
  HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
      new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos);
  switch (Objects.requireNonNull(blockType)) {
    case AVRO_DATA_BLOCK:
      if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
        return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
      } else {
        return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
      }
    case HFILE_DATA_BLOCK:
      checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
          String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
      return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
    case PARQUET_DATA_BLOCK:
      checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
          String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
      return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc, Option.ofNullable(readerSchema), header, footer, keyField);
    case DELETE_BLOCK:
      return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
    case COMMAND_BLOCK:
      return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
    default:
      throw new HoodieNotSupportedException("Unsupported Block " + blockType);
  }
}
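readBlock() backs the Reader iterator that the tests further down rely on. A hedged sketch of consuming a log file block by block and dispatching on the block type; fs, logFile and readerSchema are assumed to be set up the same way the tests set them up:

HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, logFile, readerSchema);
while (reader.hasNext()) {
  HoodieLogBlock block = reader.next();
  switch (block.getBlockType()) {
    case AVRO_DATA_BLOCK:
    case HFILE_DATA_BLOCK:
    case PARQUET_DATA_BLOCK:
      // data blocks carry the records; cast to HoodieDataBlock to access them
      break;
    case DELETE_BLOCK:
      // carries the keys to delete
      break;
    default:
      // COMMAND_BLOCK, CORRUPT_BLOCK, ...
      break;
  }
}
reader.close();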
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class HoodieAppendHandle, method appendDataAndDeleteBlocks.
protected void appendDataAndDeleteBlocks(Map<HeaderMetadataType, String> header) {
  try {
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString());
    List<HoodieLogBlock> blocks = new ArrayList<>(2);
    if (recordList.size() > 0) {
      String keyField = config.populateMetaFields()
          ? HoodieRecord.RECORD_KEY_METADATA_FIELD
          : hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
      blocks.add(getBlock(config, pickLogDataBlockFormat(), recordList, header, keyField));
    }
    if (keysToDelete.size() > 0) {
      blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header));
    }
    if (blocks.size() > 0) {
      AppendResult appendResult = writer.appendBlocks(blocks);
      processAppendResult(appendResult, recordList);
      recordList.clear();
      keysToDelete.clear();
    }
  } catch (Exception e) {
    throw new HoodieAppendException("Failed while appending records to " + writer.getLogFile().getPath(), e);
  }
}
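A condensed sketch of the same append pattern outside the handle; the writer is assumed to be built the way the tests below build theirs, dataBlock stands for a block produced via getBlock(...), and the key and partition values are purely illustrative:

Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());

List<HoodieLogBlock> blocks = new ArrayList<>();
// a data block produced elsewhere, e.g. via getBlock(...) as in the method above
blocks.add(dataBlock);
// a delete block carrying the keys to remove (illustrative key and partition path)
blocks.add(new HoodieDeleteBlock(new HoodieKey[] {new HoodieKey("record-key-1", "2021/01/01")}, header));
// both blocks are appended to the log file in a single call
AppendResult result = writer.appendBlocks(blocks);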
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testHugeLogFileWrite.
@Test
public void testHugeLogFileWrite() throws IOException, URISyntaxException, InterruptedException {
  Writer writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId("test-fileid1")
      .overBaseCommit("100")
      .withFs(fs)
      .withSizeThreshold(3L * 1024 * 1024 * 1024)
      .build();
  Schema schema = getSimpleSchema();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 1000);
  List<IndexedRecord> copyOfRecords = records.stream()
      .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
      .collect(Collectors.toList());
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes();
  HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
      new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0);
  HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false,
      logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
  long writtenSize = 0;
  int logBlockWrittenNum = 0;
  while (writtenSize < Integer.MAX_VALUE) {
    AppendResult appendResult = writer.appendBlock(reusableDataBlock);
    assertTrue(appendResult.size() > 0);
    writtenSize += appendResult.size();
    logBlockWrittenNum++;
  }
  writer.close();
  Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), true, true);
  assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
  HoodieLogBlock nextBlock = reader.next();
  assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
  HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
  List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
  assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
  assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
  int logBlockReadNum = 1;
  while (reader.hasNext()) {
    reader.next();
    logBlockReadNum++;
  }
  assertEquals(logBlockWrittenNum, logBlockReadNum, "All written log should be correctly found");
  reader.close();
  // test writing oversize data block which should be rejected
  Writer oversizeWriter = HoodieLogFormat.newWriterBuilder()
      .onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId("test-fileid1")
      .overBaseCommit("100")
      .withSizeThreshold(3L * 1024 * 1024 * 1024)
      .withFs(fs)
      .build();
  List<HoodieLogBlock> dataBlocks = new ArrayList<>(logBlockWrittenNum + 1);
  for (int i = 0; i < logBlockWrittenNum + 1; i++) {
    dataBlocks.add(reusableDataBlock);
  }
  assertThrows(HoodieIOException.class, () -> {
    oversizeWriter.appendBlocks(dataBlocks);
  }, "Blocks appended may overflow. Please decrease log block size or log block amount");
  oversizeWriter.close();
}
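The reader in this test is created with two extra boolean flags. A hedged sketch of what they appear to do, tying back to the readBlockLazily handling in readBlock() above; the meaning of the second flag is an assumption:

// first flag matches readBlockLazily in readBlock(): content bytes are skipped and only their
// location recorded; second flag is assumed to request reverse traversal of the log file
boolean readBlockLazily = true;
boolean reverseReader = true;
Reader lazyReader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema(), readBlockLazily, reverseReader);
if (lazyReader.hasNext()) {
  HoodieLogBlock firstBlock = lazyReader.next();
  if (firstBlock.getBlockContentLocation().isPresent()) {
    // with lazy reading, the content location records where the skipped payload ends in the file
    long blockEndPos = firstBlock.getBlockContentLocation().get().getBlockEndPos();
    System.out.println("Block ends at position " + blockEndPos);
  }
}
lazyReader.close();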
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testValidateCorruptBlockEndPosition.
@Test
public void testValidateCorruptBlockEndPosition() throws IOException, URISyntaxException, InterruptedException {
  Writer writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId("test-fileid1")
      .overBaseCommit("100")
      .withFs(fs)
      .build();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
  writer.appendBlock(dataBlock);
  writer.close();
  // Append some arbitrary bytes to the end of the log (mimics a partially written commit)
  fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
  FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
  // Start a new block by writing out the magic bytes
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(474);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  // Write out a length that does not conform to the content
  outputStream.writeLong(400);
  // Write out incomplete content
  outputStream.write("something-random".getBytes());
  // Record the corrupt block's end position
  long corruptBlockEndPos = outputStream.getPos();
  outputStream.flush();
  outputStream.close();
  // Append a proper block again
  writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId("test-fileid1")
      .overBaseCommit("100")
      .withFs(fs)
      .build();
  records = SchemaTestUtil.generateTestRecords(0, 10);
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
  writer.appendBlock(dataBlock);
  writer.close();
  // Read the data block and the corrupt block
  Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
  assertTrue(reader.hasNext(), "First block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should have corrupted block next");
  HoodieLogBlock block = reader.next();
  assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
  // Validate that the corrupt block's end position is recorded correctly
  assertEquals(corruptBlockEndPos, block.getBlockContentLocation().get().getBlockEndPos());
  assertTrue(reader.hasNext(), "Third block should be available");
  reader.next();
  assertFalse(reader.hasNext(), "There should be no more block left");
  reader.close();
}
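The assertion on corruptBlockEndPos ties back to readBlock() above: when a block's declared size does not lead to the next sync marker or EOF, the reader wraps the damaged region as a corrupt block whose content location records where the region ends. A hedged sketch of a caller skipping such regions while replaying a log (reader construction mirrors this test):

Reader replayReader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
while (replayReader.hasNext()) {
  HoodieLogBlock block = replayReader.next();
  if (block.getBlockType() == HoodieLogBlockType.CORRUPT_BLOCK) {
    // the end position marks where the corrupt region ends and scanning resumed
    long endPos = block.getBlockContentLocation().get().getBlockEndPos();
    System.out.println("Skipping corrupt region ending at " + endPos);
    continue;
  }
  // process well-formed blocks as usual
}
replayReader.close();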