Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.
Class HoodieLogFileCommand, method showLogFileCommits:
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
public String showLogFileCommits(
    @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final String logFilePathPattern,
    @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
  FileSystem fs = HoodieCLI.getTableMetaClient().getFs();
  List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
      .map(status -> status.getPath().toString())
      .collect(Collectors.toList());
  Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata = new HashMap<>();
  int numCorruptBlocks = 0;
  int dummyInstantTimeCount = 0;
  for (String logFilePath : logFilePaths) {
    FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
    Schema writerSchema = new AvroSchemaConverter()
        .convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath))));
    Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
    // read the avro blocks
    while (reader.hasNext()) {
      HoodieLogBlock n = reader.next();
      String instantTime;
      AtomicInteger recordCount = new AtomicInteger(0);
      if (n instanceof HoodieCorruptBlock) {
        try {
          instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
          if (instantTime == null) {
            throw new Exception("Invalid instant time " + instantTime);
          }
        } catch (Exception e) {
          numCorruptBlocks++;
          instantTime = "corrupt_block_" + numCorruptBlocks;
          // could not read metadata for corrupt block
        }
      } else {
        instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
        if (instantTime == null) {
          // This can happen when reading archived commit files since they were written without any instant time
          dummyInstantTimeCount++;
          instantTime = "dummy_instant_time_" + dummyInstantTimeCount;
        }
        if (n instanceof HoodieDataBlock) {
          try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) n).getRecordItr()) {
            recordItr.forEachRemaining(r -> recordCount.incrementAndGet());
          }
        }
      }
      if (commitCountAndMetadata.containsKey(instantTime)) {
        commitCountAndMetadata.get(instantTime)
            .add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get()));
      } else {
        List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list = new ArrayList<>();
        list.add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get()));
        commitCountAndMetadata.put(instantTime, list);
      }
    }
    reader.close();
  }
  List<Comparable[]> rows = new ArrayList<>();
  ObjectMapper objectMapper = new ObjectMapper();
  for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata.entrySet()) {
    String instantTime = entry.getKey();
    for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
      Comparable[] output = new Comparable[5];
      output[0] = instantTime;
      output[1] = tuple3._3();
      output[2] = tuple3._1().toString();
      output[3] = objectMapper.writeValueAsString(tuple3._2()._1());
      output[4] = objectMapper.writeValueAsString(tuple3._2()._2());
      rows.add(output);
    }
  }
  TableHeader header = new TableHeader()
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT_TIME)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_RECORD_COUNT)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_BLOCK_TYPE)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_HEADER_METADATA)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_FOOTER_METADATA);
  return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
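The command above boils down to wrapping each resolved path in a HoodieLogFile and walking its blocks with a HoodieLogFormat.Reader. Below is a minimal, stripped-down sketch of that loop; the FileSystem, log path, and writer schema are placeholders supplied by the caller, and import locations are assumptions based on the Hudi version these snippets appear to target.

// Minimal sketch: scan the blocks of a single log file and print block type + instant time.
// fs, logPath and writerSchema are placeholders provided by the caller.
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;

public class LogFileBlockScan {
  public static void scan(FileSystem fs, Path logPath, Schema writerSchema) throws Exception {
    // HoodieLogFile is a thin wrapper around the log file's path.
    try (Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(logPath), writerSchema)) {
      while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        // Every block exposes its type and the instant time recorded in its header.
        String instantTime = block.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
        System.out.println(block.getBlockType() + " written at instant " + instantTime);
      }
    }
  }
}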
Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.
Class ArchivedCommitsCommand, method showCommits:
@CliCommand(value = "show archived commits", help = "Read commits from archived files and show details")
public String showCommits(
    @CliOption(key = { "skipMetadata" }, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true") boolean skipMetadata,
    @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
  System.out.println("===============> Showing only " + limit + " archived commits <===============");
  HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
  String basePath = metaClient.getBasePath();
  Path archivePath = new Path(metaClient.getArchivePath() + "/.commits_.archive*");
  FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
  List<Comparable[]> allCommits = new ArrayList<>();
  for (FileStatus fs : fsStatuses) {
    // read the archived file
    HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
        new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
    List<IndexedRecord> readRecords = new ArrayList<>();
    // read the avro blocks
    while (reader.hasNext()) {
      HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
      try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
        recordItr.forEachRemaining(readRecords::add);
      }
    }
    List<Comparable[]> readCommits = readRecords.stream()
        .map(r -> (GenericRecord) r)
        .map(r -> readCommit(r, skipMetadata))
        .collect(Collectors.toList());
    allCommits.addAll(readCommits);
    reader.close();
  }
  TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("CommitType");
  if (!skipMetadata) {
    header = header.addTableHeaderField("CommitDetails");
  }
  return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allCommits);
}
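In the same spirit, here is a compact sketch of reading archived instants back out of a single archive log file. The path is a placeholder, and package locations (in particular ClosableIterator) are assumptions that may differ between Hudi releases.

// Minimal sketch: collect the Avro records stored in one archived commits file.
// archivedLogPath is a placeholder; the schema comes from the generated HoodieArchivedMetaEntry class.
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieArchivedMetaEntry;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
import org.apache.hudi.common.util.ClosableIterator;

import java.util.ArrayList;
import java.util.List;

public class ArchivedLogScan {
  public static List<IndexedRecord> readArchived(FileSystem fs, Path archivedLogPath) throws Exception {
    List<IndexedRecord> records = new ArrayList<>();
    try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(
        fs, new HoodieLogFile(archivedLogPath), HoodieArchivedMetaEntry.getClassSchema())) {
      while (reader.hasNext()) {
        // Archived timeline entries are stored as Avro data blocks.
        HoodieAvroDataBlock block = (HoodieAvroDataBlock) reader.next();
        try (ClosableIterator<IndexedRecord> it = block.getRecordItr()) {
          it.forEachRemaining(records::add);
        }
      }
    }
    return records;
  }
}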
Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.
Class TestHoodieRealtimeRecordReader, method testSchemaEvolutionAndRollbackBlockInLastLogFile:
@ParameterizedTest
@MethodSource("testArguments")
public void testSchemaEvolutionAndRollbackBlockInLastLogFile(ExternalSpillableMap.DiskMapType diskMapType,
    boolean isCompressionEnabled) throws Exception {
  // initial commit
  List<HoodieLogFile> logFiles = new ArrayList<>();
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
  String instantTime = "100";
  int numberOfRecords = 100;
  int numberOfLogRecords = numberOfRecords / 2;
  File partitionDir = InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, instantTime,
      HoodieTableType.MERGE_ON_READ);
  HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
      Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
  FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata));
  // Add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
  List<Field> firstSchemaFields = schema.getFields();
  // update files and generate new log file but don't commit
  schema = SchemaTestUtil.getComplexEvolvedSchema();
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0",
      instantTime, newCommitTime, numberOfLogRecords, 0, 1);
  long size = writer.getCurrentSize();
  logFiles.add(writer.getLogFile());
  writer.close();
  assertTrue(size > 0, "block - size should be > 0");
  // write rollback for the previous block in new log file version
  newCommitTime = "102";
  writer = InputFormatTestUtil.writeRollbackBlockToLogFile(partitionDir, fs, schema, "fileid0", instantTime,
      newCommitTime, "101", 1);
  logFiles.add(writer.getLogFile());
  writer.close();
  commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(),
      WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
  FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
  // create a split with baseFile (parquet file written earlier) and new log file(s)
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
      basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty());
  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
  JobConf jobConf = new JobConf(baseJobConf);
  List<Schema.Field> fields = schema.getFields();
  assertFalse(firstSchemaFields.containsAll(fields));
  // Try to read all the fields passed by the new schema
  setHiveColumnNameProps(fields, jobConf, true);
  jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType);
  jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled);
  HoodieRealtimeRecordReader recordReader;
  try {
    // validate record reader compaction
    recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    throw new RuntimeException("should've failed the previous line");
  } catch (HoodieException e) {
    // expected, field not found since the data written with the evolved schema was rolled back
  }
  // Try to read all the fields passed by the new schema
  setHiveColumnNameProps(firstSchemaFields, jobConf, true);
  // This time read only the fields which are part of parquet
  recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(key, value)) {
    // keep reading
  }
  reader.close();
}
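The HoodieLogFile instances accumulated into logFiles above come straight from the log writer (writer.getLogFile()); the test only hides the writer construction behind InputFormatTestUtil helpers. Outside those helpers, the writer is typically obtained through HoodieLogFormat's builder. The rough sketch below shows that path, with the partition directory, file id and base instant as placeholders, and builder method names assumed from the Hudi version these snippets appear to target.

// Minimal sketch: open a log writer for a file group and obtain its HoodieLogFile handle.
// Blocks would normally be appended via writer.appendBlock(...) before closing.
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.log.HoodieLogFormat;
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;

public class LogWriterSketch {
  public static HoodieLogFile openAndTrack(FileSystem fs, Path partitionPath) throws Exception {
    Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)                       // partition directory (placeholder)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)  // ".log"
        .withFileId("fileid0")                             // file group id (placeholder)
        .overBaseCommit("100")                             // base instant the log attaches to (placeholder)
        .withFs(fs)
        .build();
    // The writer hands back the HoodieLogFile it is writing to, which is what the
    // test above accumulates into its logFiles list.
    HoodieLogFile logFile = writer.getLogFile();
    writer.close();
    return logFile;
  }
}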
Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.
Class TestHoodieRealtimeRecordReader, method testSchemaEvolution:
@Test
public void testSchemaEvolution() throws Exception {
  ExternalSpillableMap.DiskMapType diskMapType = ExternalSpillableMap.DiskMapType.BITCASK;
  boolean isCompressionEnabled = true;
  // initial commit
  List<HoodieLogFile> logFiles = new ArrayList<>();
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
  String instantTime = "100";
  int numberOfRecords = 100;
  int numberOfLogRecords = numberOfRecords / 2;
  File partitionDir = InputFormatTestUtil.prepareSimpleParquetTable(basePath, schema, 1, numberOfRecords, instantTime,
      HoodieTableType.MERGE_ON_READ);
  HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
      Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
  FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata));
  // Add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
  List<Field> firstSchemaFields = schema.getFields();
  // 2nd commit w/ evolved schema
  Schema evolvedSchema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedCompatibleSchema());
  List<Field> secondSchemaFields = evolvedSchema.getFields();
  String newCommitTime = "101";
  File partitionDir1 = InputFormatTestUtil.prepareSimpleParquetTable(basePath, evolvedSchema, 1, numberOfRecords,
      instantTime, HoodieTableType.MERGE_ON_READ, "2017", "05", "01");
  HoodieCommitMetadata commitMetadata1 = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
      Option.empty(), WriteOperationType.UPSERT, evolvedSchema.toString(), HoodieTimeline.COMMIT_ACTION);
  FileCreateUtils.createCommit(basePath.toString(), newCommitTime, Option.of(commitMetadata1));
  // Add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir1.getPath());
  // create a split with baseFile from 1st commit.
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1_" + instantTime + ".parquet"), 0, 1, baseJobConf),
      basePath.toUri().toString(), logFiles, newCommitTime, false, Option.empty());
  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
  JobConf jobConf = new JobConf(baseJobConf);
  // Try to read all the fields passed by the new schema
  setHiveColumnNameProps(secondSchemaFields, jobConf, true);
  // This time read only the fields which are part of parquet
  HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  while (recordReader.next(key, value)) {
    // keep reading
  }
  reader.close();
}
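In both tests, the HoodieLogFile usage that matters is simply handing the list to the HoodieRealtimeFileSplit constructor alongside the base parquet split. A minimal sketch of that construction follows; every path, the JobConf, and the max commit time are placeholders, and the constructor shape mirrors the tests above.

// Minimal sketch: combine a base parquet split with its HoodieLogFile list into a realtime split.
// All literals are placeholders.
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit;

public class RealtimeSplitSketch {
  public static HoodieRealtimeFileSplit build(JobConf jobConf) throws Exception {
    Path baseFile = new Path("/tmp/table/2016/05/01/fileid0_1_100.parquet");
    List<HoodieLogFile> logFiles = Arrays.asList(
        new HoodieLogFile(new Path("/tmp/table/2016/05/01/.fileid0_100.log.1")),
        new HoodieLogFile(new Path("/tmp/table/2016/05/01/.fileid0_100.log.2")));
    FileSplit baseSplit = new FileSplit(baseFile, 0, 1, jobConf);
    // base path, log files, max commit time, then the boolean flag and optional
    // virtual-key info passed exactly as in the tests above.
    return new HoodieRealtimeFileSplit(baseSplit, "/tmp/table", logFiles, "102", false, Option.empty());
  }
}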
Use of org.apache.hudi.common.model.HoodieLogFile in project presto by prestodb.
Class TestCustomSplitConversionUtils, method testHudiRealtimeBootstrapBaseFileSplitConverter:
@Test
public void testHudiRealtimeBootstrapBaseFileSplitConverter() throws IOException {
  List<String> deltaLogPaths = Arrays.asList("test1", "test2", "test3");
  List<HoodieLogFile> deltaLogFiles = deltaLogPaths.stream()
      .map(p -> new HoodieLogFile(new Path(p)))
      .collect(Collectors.toList());
  String maxCommitTime = "max_commit_time";
  Path bootstrapSourceFilePath = new Path("/test/source/test.parquet");
  long bootstrapSourceSplitStartPos = 0L;
  long bootstrapSourceSplitLength = 200L;
  FileSplit baseSplit = new FileSplit(FILE_PATH, SPLIT_START_POS, SPLIT_LENGTH, SPLIT_HOSTS);
  FileSplit bootstrapSourceSplit = new FileSplit(bootstrapSourceFilePath, bootstrapSourceSplitStartPos,
      bootstrapSourceSplitLength, new String[0]);
  FileSplit hudiSplit = new HoodieRealtimeBootstrapBaseFileSplit(baseSplit, BASE_PATH, deltaLogFiles, maxCommitTime,
      bootstrapSourceSplit, false, Option.empty());
  // Test conversion of HudiSplit -> customSplitInfo
  Map<String, String> customSplitInfo = CustomSplitConversionUtils.extractCustomSplitInfo(hudiSplit);
  // Test conversion of (customSplitInfo + baseSplit) -> HudiSplit
  HoodieRealtimeBootstrapBaseFileSplit recreatedSplit =
      (HoodieRealtimeBootstrapBaseFileSplit) CustomSplitConversionUtils.recreateSplitWithCustomInfo(baseSplit, customSplitInfo);
  assertEquals(FILE_PATH, recreatedSplit.getPath());
  assertEquals(SPLIT_START_POS, recreatedSplit.getStart());
  assertEquals(SPLIT_LENGTH, recreatedSplit.getLength());
  assertEquals(SPLIT_HOSTS, recreatedSplit.getLocations());
  assertEquals(BASE_PATH, recreatedSplit.getBasePath());
  assertEquals(deltaLogPaths, recreatedSplit.getDeltaLogPaths());
  assertEquals(maxCommitTime, recreatedSplit.getMaxCommitTime());
  assertEquals(bootstrapSourceFilePath, recreatedSplit.getBootstrapFileSplit().getPath());
  assertEquals(bootstrapSourceSplitStartPos, recreatedSplit.getBootstrapFileSplit().getStart());
  assertEquals(bootstrapSourceSplitLength, recreatedSplit.getBootstrapFileSplit().getLength());
}
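The only HoodieLogFile-specific step in this test is wrapping raw path strings into HoodieLogFile objects and recovering the strings after the split round trip. That mapping is small enough to show on its own; the self-contained sketch below uses throwaway path strings.

// Minimal sketch: path string -> HoodieLogFile -> path string, the mapping the split
// conversion relies on when getDeltaLogPaths() is compared against the input list.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;

public class DeltaLogPathRoundTrip {
  public static void main(String[] args) {
    List<String> deltaLogPaths = Arrays.asList("test1", "test2", "test3");
    List<HoodieLogFile> deltaLogFiles = deltaLogPaths.stream()
        .map(p -> new HoodieLogFile(new Path(p)))
        .collect(Collectors.toList());
    // Recover the original strings from the HoodieLogFile wrappers.
    List<String> recovered = deltaLogFiles.stream()
        .map(logFile -> logFile.getPath().toString())
        .collect(Collectors.toList());
    System.out.println(recovered.equals(deltaLogPaths)); // expected: true
  }
}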