Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testAppendAndReadOnCorruptedLog.
@Test
public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
writer.appendBlock(dataBlock);
writer.close();
// Append some arbitrary bytes to the end of the log (mimics a partially written commit)
fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
// Hand-craft a corrupt block: magic bytes followed by metadata and content that do not match
outputStream.write(HoodieLogFormat.MAGIC);
// Write out a length that does not conform with the content
outputStream.writeLong(474);
outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
// Write out a length that does not conform with the content
outputStream.writeLong(400);
// Write out incomplete content
outputStream.write("something-random".getBytes());
outputStream.flush();
outputStream.close();
// Append a proper block after the corrupted one
writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
records = SchemaTestUtil.generateTestRecords(0, 10);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
writer.appendBlock(dataBlock);
writer.close();
// First round of reads - we should see the first data block, then the corrupt block, then the appended block
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
assertTrue(reader.hasNext(), "First block should be available");
reader.next();
assertTrue(reader.hasNext(), "We should have corrupted block next");
HoodieLogBlock block = reader.next();
assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
assertTrue(reader.hasNext(), "Third block should be available");
reader.next();
assertFalse(reader.hasNext(), "There should be no more block left");
reader.close();
// Simulate another failure back to back
outputStream = fs.append(writer.getLogFile().getPath());
// Hand-craft another corrupt block: magic bytes followed by metadata and content that do not match
outputStream.write(HoodieLogFormat.MAGIC);
// Write out a length that does not conform with the content
outputStream.writeLong(1000);
outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
// Write out a length that does not conform with the content
outputStream.writeLong(500);
// Write out some bytes
outputStream.write("something-else-random".getBytes());
outputStream.flush();
outputStream.close();
// Should be able to append a new block
writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
records = SchemaTestUtil.generateTestRecords(0, 100);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
writer.appendBlock(dataBlock);
writer.close();
// Second round of reads - all three data blocks and both corrupt blocks should be readable
reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
assertTrue(reader.hasNext(), "First block should be available");
reader.next();
assertTrue(reader.hasNext(), "We should get the 1st corrupted block next");
reader.next();
assertTrue(reader.hasNext(), "Third block should be available");
reader.next();
assertTrue(reader.hasNext(), "We should get the 2nd corrupted block next");
block = reader.next();
assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
assertTrue(reader.hasNext(), "We should get the last block next");
reader.next();
assertFalse(reader.hasNext(), "We should have no more blocks left");
reader.close();
}
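The pattern exercised above - scan forward, treat CORRUPT_BLOCK entries as recoverable noise, and keep reading - can be factored into a small helper. The following is a minimal sketch that relies only on the reader calls used in the test (HoodieLogFormat.newReader, hasNext, next, getBlockType); the class and method names are illustrative, not part of Hudi.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;

public class LogCorruptionProbe {
  // Scans the log file block by block and counts how many blocks the reader
  // surfaces as CORRUPT_BLOCK instead of failing the whole read.
  public static int countCorruptBlocks(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
    int corrupt = 0;
    try (Reader reader = HoodieLogFormat.newReader(fs, logFile, readerSchema)) {
      while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        if (block.getBlockType() == HoodieLogBlockType.CORRUPT_BLOCK) {
          corrupt++;
        }
      }
    }
    return corrupt;
  }
}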
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testBasicAppendAndTraverseInReverse.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords1 = records1.stream()
    .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
    .collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
writer.appendBlock(dataBlock);
writer.close();
writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
writer.appendBlock(dataBlock);
writer.close();
// Close and Open again and append 100 more records
writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
writer.appendBlock(dataBlock);
writer.close();
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
HoodieLogFileReader reader = new HoodieLogFileReader(fs,
    new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()),
    SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
assertTrue(reader.hasPrev(), "Third block should be available");
reader.moveToPrev();
assertTrue(reader.hasPrev(), "Second block should be available");
reader.moveToPrev();
// After moving back twice, this reader.prev() should read the first block written
assertTrue(reader.hasPrev(), "First block should be available");
HoodieLogBlock prevBlock = reader.prev();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords1.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords1, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
assertFalse(reader.hasPrev());
reader.close();
}
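For completeness, the reverse traversal used above can be written as a plain loop over hasPrev()/prev(). This is a minimal sketch assuming the HoodieLogFileReader has been opened with reverse reading enabled, as in the constructor call in the test; the class and method names are illustrative.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.table.log.HoodieLogFileReader;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;

public class ReverseLogWalk {
  // Collects all blocks of a log file from last to first using the reverse-reader API.
  public static List<HoodieLogBlock> readBlocksNewestFirst(HoodieLogFileReader reader) throws IOException {
    List<HoodieLogBlock> blocks = new ArrayList<>();
    while (reader.hasPrev()) {
      blocks.add(reader.prev());
    }
    return blocks;
  }
}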
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testDataBlockFormatAppendAndReadWithProjectedSchema.
@ParameterizedTest
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK" })
public void testDataBlockFormatAppendAndReadWithProjectedSchema(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
List<GenericRecord> records = SchemaTestUtil.generateTestGenericRecords(0, 1000);
Schema schema = getSimpleSchema();
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<HoodieLogBlock.HeaderMetadataType, String>() {
  {
    put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  }
};
// Init BenchmarkCounter to report the number of bytes actually read from the block
BenchmarkCounter.initCounterFromReporter(HadoopMapRedUtils.createTestReporter(), fs.getConf());
// NOTE: Have to use this ugly cast since List is not covariant in its type parameter
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, (List<IndexedRecord>) (List) records, header);
writer.appendBlock(dataBlock);
writer.close();
Schema projectedSchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name"));
List<GenericRecord> projectedRecords = HoodieAvroUtils.rewriteRecords(records, projectedSchema);
try (Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), projectedSchema, true, false)) {
assertTrue(reader.hasNext(), "First block should be available");
HoodieLogBlock nextBlock = reader.next();
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
Map<HoodieLogBlockType, Integer> expectedReadBytes = new HashMap<HoodieLogBlockType, Integer>() {
  {
    // byte counting is not supported for this block type
    put(HoodieLogBlockType.AVRO_DATA_BLOCK, 0);
    // byte counting is not supported for this block type
    put(HoodieLogBlockType.HFILE_DATA_BLOCK, 0);
    put(HoodieLogBlockType.PARQUET_DATA_BLOCK, 2605);
  }
};
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(projectedRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(projectedRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
assertEquals(dataBlockRead.getSchema(), projectedSchema);
int bytesRead = (int) BenchmarkCounter.getBytesRead();
assertEquals(expectedReadBytes.get(dataBlockType), bytesRead, "Read bytes have to match");
}
}
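The projection step in this test generalizes to any subset of columns. Below is a minimal sketch using the same HoodieAvroUtils.generateProjectionSchema call and the five-argument HoodieLogFormat.newReader overload seen in the test; the field list, class and method names, and the choice to only inspect the block schema are illustrative.

import java.io.IOException;
import java.util.Collections;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;

public class ProjectedLogRead {
  // Opens the log file with a single-column projection of the writer schema and
  // prints the schema each data block reports back after projection.
  public static void readOnlyNameColumn(FileSystem fs, HoodieLogFile logFile, Schema writerSchema) throws IOException {
    Schema projected = HoodieAvroUtils.generateProjectionSchema(writerSchema, Collections.singletonList("name"));
    try (Reader reader = HoodieLogFormat.newReader(fs, logFile, projected, true, false)) {
      while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        if (block instanceof HoodieDataBlock) {
          System.out.println("Data block schema after projection: " + ((HoodieDataBlock) block).getSchema());
        }
      }
    }
  }
}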
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class TestHoodieLogFormat, method testBasicWriteAndScan.
@Test
public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException {
Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
Schema schema = getSimpleSchema();
List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
List<IndexedRecord> copyOfRecords = records.stream()
    .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
    .collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
writer.appendBlock(dataBlock);
writer.close();
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
HoodieLogBlock nextBlock = reader.next();
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
assertEquals(copyOfRecords.size(), recordsRead.size(), "Read records size should be equal to the written records size");
assertEquals(copyOfRecords, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
reader.close();
}
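The write side of this test follows the same writer-builder chain used throughout the class. A minimal sketch of that pattern as a reusable helper is shown below; the class and method names and the parameter choices are illustrative, and the data block itself is assumed to be built elsewhere (the tests use their local getDataBlock helper for that).

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;

public class LogAppendHelper {
  // Appends one pre-built data block to the delta log of a file group and closes the writer.
  public static void appendDataBlock(FileSystem fs, Path partitionPath, String fileId, String baseInstantTime,
      HoodieDataBlock dataBlock) throws IOException, InterruptedException {
    Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId(fileId)
        .overBaseCommit(baseInstantTime)
        .withFs(fs)
        .build();
    try {
      writer.appendBlock(dataBlock);
    } finally {
      writer.close();
    }
  }
}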
Use of org.apache.hudi.common.table.log.block.HoodieLogBlock in project hudi by apache.
The class HoodieLogFileCommand, method showLogFileRecords.
@CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(
    @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
    @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") final String logFilePathPattern,
    @CliOption(key = "mergeRecords", help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException {
System.out.println("===============> Showing only " + limit + " records <===============");
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
FileSystem fs = client.getFs();
List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
    .map(status -> status.getPath().toString())
    .sorted(Comparator.reverseOrder())
    .collect(Collectors.toList());
// there must be at least one matching log file
assert logFilePaths.size() > 0 : "There is no log file";
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
AvroSchemaConverter converter = new AvroSchemaConverter();
// get schema from last log file
Schema readerSchema = converter.convert(Objects.requireNonNull(
    TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))));
List<IndexedRecord> allRecords = new ArrayList<>();
if (shouldMerge) {
System.out.println("===========================> MERGING RECORDS <===================");
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
    .withFileSystem(fs)
    .withBasePath(client.getBasePath())
    .withLogFilePaths(logFilePaths)
    .withReaderSchema(readerSchema)
    .withLatestInstantTime(client.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp())
    .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
    .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
    .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
    .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
    .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
    .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
    .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
    .build();
for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
if (allRecords.size() < limit) {
allRecords.add(record.get());
}
}
} else {
for (String logFile : logFilePaths) {
Schema writerSchema = new AvroSchemaConverter().convert(Objects.requireNonNull(
    TableSchemaResolver.readSchemaFromLogFile(client.getFs(), new Path(logFile))));
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
// read the avro blocks
while (reader.hasNext()) {
HoodieLogBlock n = reader.next();
if (n instanceof HoodieDataBlock) {
HoodieDataBlock blk = (HoodieDataBlock) n;
try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
recordItr.forEachRemaining(record -> {
if (allRecords.size() < limit) {
allRecords.add(record);
}
});
}
}
}
reader.close();
if (allRecords.size() >= limit) {
break;
}
}
}
String[][] rows = new String[allRecords.size()][];
int i = 0;
for (IndexedRecord record : allRecords) {
String[] data = new String[1];
data[0] = record.toString();
rows[i] = data;
i++;
}
return HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
}
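The unmerged branch of showLogFileRecords boils down to draining each data block's record iterator until the limit is hit. A minimal sketch of that inner loop is below; it reuses the getRecordItr pattern from the command, the class and method names are illustrative, and the ClosableIterator import path is assumed to be org.apache.hudi.common.util.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.util.ClosableIterator;

public class LogRecordSampler {
  // Collects up to `limit` records from the data blocks of a single log file,
  // skipping non-data blocks just like the unmerged branch of the command above.
  public static List<IndexedRecord> sampleRecords(FileSystem fs, HoodieLogFile logFile, Schema schema, int limit) throws IOException {
    List<IndexedRecord> out = new ArrayList<>();
    try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, logFile, schema)) {
      while (reader.hasNext() && out.size() < limit) {
        HoodieLogBlock block = reader.next();
        if (block instanceof HoodieDataBlock) {
          try (ClosableIterator<IndexedRecord> it = ((HoodieDataBlock) block).getRecordItr()) {
            while (it.hasNext() && out.size() < limit) {
              out.add(it.next());
            }
          }
        }
      }
    }
    return out;
  }
}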