Use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.
From the class TestHoodieLogFormat, method testAvroLogRecordReaderMergingMultipleLogFiles.
/*
* During a Spark stage failure, when the stage is retried, tasks that were part of the previous attempt
* of the stage may continue to run. As a result, two different tasks could end up performing the same operation.
* When trying to update the log file, only one of the tasks would succeed (the one holding the lease on the log file).
*
* In order to make progress in this scenario, the second task attempting to update the log file rolls over to
* a new version of the log file. As a result, we might end up with two log files containing the same set of
* data records.
*
* The following unit tests mimic this scenario to ensure that the reader can handle merging multiple log files
* with duplicate data.
*/
private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1, int numRecordsInLog2,
    ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean readBlocksLazily) {
try {
// Write one Data block with same InstantTime (written in same batch)
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
List<IndexedRecord> records = SchemaTestUtil.generateHoodieTestRecords(0, 101);
List<IndexedRecord> records2 = new ArrayList<>(records);
// Write1 with numRecordsInLog1 records written to log.1
Writer writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .build();
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records.subList(0, numRecordsInLog1), header);
writer.appendBlock(dataBlock);
// Get the size of the block
long size = writer.getCurrentSize();
writer.close();
// write2 with numRecordsInLog2 records written to log.2
Writer writer2 = HoodieLogFormat.newWriterBuilder()
    .onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-fileid1")
    .overBaseCommit("100")
    .withFs(fs)
    .withSizeThreshold(size - 1)
    .build();
Map<HoodieLogBlock.HeaderMetadataType, String> header2 = new HashMap<>();
header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieDataBlock dataBlock2 = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2.subList(0, numRecordsInLog2), header2);
writer2.appendBlock(dataBlock2);
writer2.close();
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
// From the two log files generated, read the records
List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
    .map(s -> s.getPath().toString())
    .collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
    .withFileSystem(fs)
    .withBasePath(basePath)
    .withLogFilePaths(allLogFiles)
    .withReaderSchema(schema)
    .withLatestInstantTime("100")
    .withMaxMemorySizeInBytes(10240L)
    .withReadBlocksLazily(readBlocksLazily)
    .withReverseReader(false)
    .withBufferSize(bufferSize)
    .withSpillableMapBasePath(BASE_OUTPUT_PATH)
    .withDiskMapType(diskMapType)
    .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled)
    .build();
assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(),
    "the merged scanner should de-duplicate records present in both log files");
} catch (Exception e) {
  // surface the failure instead of swallowing it, so the test cannot pass silently
  throw new RuntimeException(e);
}
}
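The test above only asserts the merged record count. For reference, the minimal standalone sketch below shows the same scanner pattern end to end; it is not part of the Hudi test suite, the helper name mergedRecordKeys is hypothetical, and values such as the 10 KB memory budget and 4 KB buffer are illustrative. The caller is assumed to supply the file system, base path, reader schema, latest instant time and a spill directory.
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;

public class MergedLogKeysExample {
  // Scans a set of log files that may contain duplicate records (e.g. after a Spark task
  // retry rolled over to a new log file) and returns the merged, de-duplicated record keys.
  public static List<String> mergedRecordKeys(FileSystem fs, String basePath, List<String> logFilePaths,
      Schema readerSchema, String latestInstantTime, String spillableMapBasePath) {
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(basePath)
        .withLogFilePaths(logFilePaths)
        .withReaderSchema(readerSchema)
        .withLatestInstantTime(latestInstantTime)
        .withMaxMemorySizeInBytes(10240L)     // illustrative spillable-map budget
        .withReadBlocksLazily(true)
        .withReverseReader(false)
        .withBufferSize(4096)                 // illustrative DFS stream buffer size
        .withSpillableMapBasePath(spillableMapBasePath)
        .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK)
        .withBitCaskDiskMapCompressionEnabled(false)
        .build();
    List<String> keys = new ArrayList<>();
    // the scanner is iterable over the merged records, so a key written to both log files appears once
    for (HoodieRecord<? extends HoodieRecordPayload> record : scanner) {
      keys.add(record.getRecordKey());
    }
    scanner.close();
    return keys;
  }
}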
Use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.
From the class HoodieLogFileCommand, method showLogFileRecords.
@CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(
    @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
    @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") final String logFilePathPattern,
    @CliOption(key = "mergeRecords", help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException {
System.out.println("===============> Showing only " + limit + " records <===============");
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
FileSystem fs = client.getFs();
List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
    .map(status -> status.getPath().toString())
    .sorted(Comparator.reverseOrder())
    .collect(Collectors.toList());
// there must be at least one matching log file
assert logFilePaths.size() > 0 : "There is no log file";
// TODO : readerSchema can change across blocks/log files, fix this inside Scanner
AvroSchemaConverter converter = new AvroSchemaConverter();
// get schema from last log file
Schema readerSchema = converter.convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))));
List<IndexedRecord> allRecords = new ArrayList<>();
if (shouldMerge) {
System.out.println("===========================> MERGING RECORDS <===================");
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
    .withFileSystem(fs)
    .withBasePath(client.getBasePath())
    .withLogFilePaths(logFilePaths)
    .withReaderSchema(readerSchema)
    .withLatestInstantTime(client.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp())
    .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
    .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
    .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
    .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
    .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
    .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
    .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
    .build();
for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
if (allRecords.size() < limit) {
allRecords.add(record.get());
}
}
} else {
for (String logFile : logFilePaths) {
Schema writerSchema = new AvroSchemaConverter().convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(client.getFs(), new Path(logFile))));
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
// read the avro blocks
while (reader.hasNext()) {
HoodieLogBlock n = reader.next();
if (n instanceof HoodieDataBlock) {
HoodieDataBlock blk = (HoodieDataBlock) n;
try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
recordItr.forEachRemaining(record -> {
if (allRecords.size() < limit) {
allRecords.add(record);
}
});
}
}
}
reader.close();
if (allRecords.size() >= limit) {
break;
}
}
}
String[][] rows = new String[allRecords.size()][];
int i = 0;
for (IndexedRecord record : allRecords) {
String[] data = new String[1];
data[0] = record.toString();
rows[i] = data;
i++;
}
return HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
}
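For comparison with the merged path above, the sketch below pulls every record out of a single log file block by block, with no merging, the way the else branch does. It is a hypothetical standalone extraction: the helper name readLogFileRecords is not part of Hudi, and the import location of ClosableIterator can differ between Hudi versions.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.util.ClosableIterator;

public class LogFileRecordsExample {
  // Reads all records of a single log file without merging: every data block is visited
  // in order, and duplicate keys are returned as-is.
  public static List<IndexedRecord> readLogFileRecords(FileSystem fs, Path logFilePath, Schema schema)
      throws IOException {
    List<IndexedRecord> records = new ArrayList<>();
    HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(logFilePath), schema);
    try {
      while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        if (block instanceof HoodieDataBlock) {
          // getRecordItr() streams the block's records; closing it releases block resources
          try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) block).getRecordItr()) {
            recordItr.forEachRemaining(records::add);
          }
        }
      }
    } finally {
      reader.close();
    }
    return records;
  }
}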
Use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.
From the class TestHoodieLogFileCommand, method testShowLogFileRecordsWithMerge.
/**
* Test case for 'show logfile records' with merge.
*/
@Test
public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedException, URISyntaxException {
// create commit instant
HoodieTestCommitMetadataGenerator.createCommitFile(tablePath, INSTANT_TIME, HoodieCLI.conf);
// write to path '2015/03/16'.
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH;
Files.createDirectories(Paths.get(partitionPath));
HoodieLogFormat.Writer writer = null;
try {
// set little threshold to split file.
writer = HoodieLogFormat.newWriterBuilder()
    .onParentPath(new Path(partitionPath))
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId("test-log-fileid1")
    .overBaseCommit(INSTANT_TIME)
    .withFs(fs)
    .withSizeThreshold(500)
    .build();
List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
writer.appendBlock(dataBlock);
} finally {
if (writer != null) {
writer.close();
}
}
CommandResult cr = shell().executeCommand("show logfile records --logFilePathPattern " + partitionPath + "/* --mergeRecords true");
assertTrue(cr.isSuccess());
// get expected result of 10 records.
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*"))).map(status -> status.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
    .withFileSystem(fs)
    .withBasePath(tablePath)
    .withLogFilePaths(logFilePaths)
    .withReaderSchema(schema)
    .withLatestInstantTime(INSTANT_TIME)
    .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
    .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
    .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
    .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
    .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
    .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
    .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
    .build();
Iterator<HoodieRecord<? extends HoodieRecordPayload>> records = scanner.iterator();
int num = 0;
int maxSize = 10;
List<IndexedRecord> indexRecords = new ArrayList<>();
while (records.hasNext() && num < maxSize) {
Option<IndexedRecord> hoodieRecord = records.next().getData().getInsertValue(schema);
indexRecords.add(hoodieRecord.get());
num++;
}
String[][] rows = indexRecords.stream().map(r -> new String[] { r.toString() }).toArray(String[][]::new);
assertNotNull(rows);
String expected = HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
expected = removeNonWordAndStripSpace(expected);
String got = removeNonWordAndStripSpace(cr.getResult().toString());
assertEquals(expected, got);
}
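The assertion above relies on removeNonWordAndStripSpace, a test utility that is not reproduced on this page. A minimal sketch of what such a normalizer could look like is shown below; it is an assumption based only on the method name, not the actual Hudi implementation.
// hypothetical normalizer: strips whitespace and any non-word characters so that
// table borders and padding emitted by HoodiePrintHelper do not affect the comparison
private static String removeNonWordAndStripSpace(String input) {
  return input.replaceAll("\\s+", "").replaceAll("\\W+", "");
}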
Use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.
From the class MergeOnReadInputFormat, method getLogFileIterator.
private ClosableIterator<RowData> getLogFileIterator(MergeOnReadInputSplit split) {
final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema());
final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema());
final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema);
final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType());
final HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(split, tableSchema, hadoopConf, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
final Iterator<String> logRecordsKeyIterator = scanner.getRecords().keySet().iterator();
final int[] pkOffset = tableState.getPkOffsetsInRequired();
// flag saying whether the pk semantics have been dropped by user-specified
// projections. For example, if the pk fields are [a, b] but the user only selects a,
// then the pk semantics are lost.
final boolean pkSemanticLost = Arrays.stream(pkOffset).anyMatch(offset -> offset == -1);
final LogicalType[] pkTypes = pkSemanticLost ? null : tableState.getPkTypes(pkOffset);
final StringToRowDataConverter converter = pkSemanticLost ? null : new StringToRowDataConverter(pkTypes);
return new ClosableIterator<RowData>() {
private RowData currentRecord;
@Override
public boolean hasNext() {
while (logRecordsKeyIterator.hasNext()) {
String curAvroKey = logRecordsKeyIterator.next();
Option<IndexedRecord> curAvroRecord = null;
final HoodieAvroRecord<?> hoodieRecord = (HoodieAvroRecord) scanner.getRecords().get(curAvroKey);
try {
curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema);
} catch (IOException e) {
throw new HoodieException("Get avro insert value error for key: " + curAvroKey, e);
}
if (!curAvroRecord.isPresent()) {
// delete record found
if (emitDelete && !pkSemanticLost) {
GenericRowData delete = new GenericRowData(tableState.getRequiredRowType().getFieldCount());
final String recordKey = hoodieRecord.getRecordKey();
final String[] pkFields = KeyGenUtils.extractRecordKeys(recordKey);
final Object[] converted = converter.convert(pkFields);
for (int i = 0; i < pkOffset.length; i++) {
delete.setField(pkOffset[i], converted[i]);
}
delete.setRowKind(RowKind.DELETE);
this.currentRecord = delete;
return true;
}
// delete record, but deletes are not emitted (or the pk semantics were lost):
// fall through and keep scanning the remaining keys
} else {
final IndexedRecord avroRecord = curAvroRecord.get();
final RowKind rowKind = FormatUtils.getRowKindSafely(avroRecord, tableState.getOperationPos());
if (rowKind == RowKind.DELETE && !emitDelete) {
// skip the delete record
continue;
}
GenericRecord requiredAvroRecord = buildAvroRecordBySchema(avroRecord, requiredSchema, requiredPos, recordBuilder);
currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord);
currentRecord.setRowKind(rowKind);
return true;
}
}
return false;
}
@Override
public RowData next() {
return currentRecord;
}
@Override
public void close() {
scanner.close();
}
};
}
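A note on how the iterator above is consumed: hasNext() does the real work of pulling the next merged record from the scanner, skipping deletes that should not be emitted and converting Avro to RowData, while next() only returns the row prepared by the last hasNext() call. A minimal, hypothetical consumer is sketched below; the drain helper is illustrative and not part of MergeOnReadInputFormat, and the ClosableIterator import path can vary across Hudi versions.
import org.apache.flink.table.data.RowData;
import org.apache.hudi.common.util.ClosableIterator;

public final class LogIteratorConsumer {
  // Drains an iterator such as the one returned by getLogFileIterator(split) and counts the
  // rows; close() must be called so the underlying HoodieMergedLogRecordScanner is released.
  public static long drain(ClosableIterator<RowData> iterator) {
    long rowCount = 0;
    try {
      while (iterator.hasNext()) {
        iterator.next();
        rowCount++;
      }
    } finally {
      iterator.close();
    }
    return rowCount;
  }
}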
Use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.
From the class TestData, method checkWrittenDataMOR.
/**
* Checks that the MERGE_ON_READ source data is written as expected.
*
* <p>Note: Replace it with the Flink reader when it is supported.
*
* @param fs The file system
* @param latestInstant The latest committed instant of current table
* @param baseFile The base file path to check; should be a directory
* @param expected The expected results mapping, keyed by partition path
* @param partitions The expected number of partitions
* @param schema The read schema
*/
public static void checkWrittenDataMOR(FileSystem fs, String latestInstant, File baseFile, Map<String, String> expected, int partitions, Schema schema) {
assert baseFile.isDirectory() : "Base path should be a directory";
FileFilter partitionFilter = file -> !file.getName().startsWith(".");
File[] partitionDirs = baseFile.listFiles(partitionFilter);
assertNotNull(partitionDirs);
assertThat(partitionDirs.length, is(partitions));
for (File partitionDir : partitionDirs) {
// collect the delta log files, skipping "..*"-prefixed entries such as local FS checksum files
File[] dataFiles = partitionDir.listFiles(file -> file.getName().contains(".log.") && !file.getName().startsWith(".."));
assertNotNull(dataFiles);
HoodieMergedLogRecordScanner scanner = getScanner(fs, baseFile.getPath(),
    Arrays.stream(dataFiles).map(File::getAbsolutePath).sorted(Comparator.naturalOrder()).collect(Collectors.toList()),
    schema, latestInstant);
List<String> readBuffer = scanner.getRecords().values().stream().map(hoodieRecord -> {
try {
// in case it is a delete
GenericRecord record = (GenericRecord) hoodieRecord.getData().getInsertValue(schema, new Properties()).orElse(null);
return record == null ? (String) null : filterOutVariables(record);
} catch (IOException e) {
throw new RuntimeException(e);
}
}).filter(Objects::nonNull).sorted(Comparator.naturalOrder()).collect(Collectors.toList());
assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
}
}
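The getScanner call above refers to a small factory in the same test utilities that is not shown on this page. A plausible sketch is given below, assuming it simply wires the file paths into the builder API used in the other snippets; the memory, buffer and spill-path values are illustrative and not necessarily what Hudi's TestData actually uses.
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;

final class ScannerFactorySketch {
  // hypothetical stand-in for the getScanner helper referenced in checkWrittenDataMOR
  static HoodieMergedLogRecordScanner getScanner(FileSystem fs, String basePath, List<String> logPaths,
      Schema readSchema, String instant) {
    return HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(basePath)
        .withLogFilePaths(logPaths)
        .withReaderSchema(readSchema)
        .withLatestInstantTime(instant)
        .withReadBlocksLazily(false)
        .withReverseReader(false)
        .withBufferSize(16 * 1024)                      // illustrative buffer size
        .withMaxMemorySizeInBytes(1024 * 1024 * 1024L)  // illustrative 1 GB spillable-map budget
        .withSpillableMapBasePath("/tmp/")              // illustrative spill path
        .withDiskMapType(ExternalSpillableMap.DiskMapType.BITCASK)
        .withBitCaskDiskMapCompressionEnabled(false)
        .build();
  }
}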