Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in the Apache Hudi project.
From the class ArchivedCommitsCommand, method showArchivedCommits:
@CliCommand(value = "show archived commit stats", help = "Read commits from archived files and show details")
public String showArchivedCommits(
    @CliOption(key = { "archiveFolderPattern" }, help = "Archive Folder", unspecifiedDefaultValue = "") String folder,
    @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
  System.out.println("===============> Showing only " + limit + " archived commits <===============");
  String basePath = HoodieCLI.getTableMetaClient().getBasePath();
  Path archivePath = new Path(HoodieCLI.getTableMetaClient().getArchivePath() + "/.commits_.archive*");
  if (folder != null && !folder.isEmpty()) {
    archivePath = new Path(basePath + "/.hoodie/" + folder);
  }
  FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
  List<Comparable[]> allStats = new ArrayList<>();
  for (FileStatus fs : fsStatuses) {
    // read the archived file
    Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
        new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
    List<IndexedRecord> readRecords = new ArrayList<>();
    // read the avro blocks
    while (reader.hasNext()) {
      HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
      blk.getRecordItr().forEachRemaining(readRecords::add);
    }
    List<Comparable[]> readCommits = readRecords.stream()
        .map(r -> (GenericRecord) r)
        .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
            || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
        .flatMap(r -> {
          HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get()
              .deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata"));
          final String instantTime = r.get("commitTime").toString();
          final String action = r.get("actionType").toString();
          return metadata.getPartitionToWriteStats().values().stream()
              .flatMap(hoodieWriteStats -> hoodieWriteStats.stream().map(hoodieWriteStat -> {
                List<Comparable> row = new ArrayList<>();
                row.add(action);
                row.add(instantTime);
                row.add(hoodieWriteStat.getPartitionPath());
                row.add(hoodieWriteStat.getFileId());
                row.add(hoodieWriteStat.getPrevCommit());
                row.add(hoodieWriteStat.getNumWrites());
                row.add(hoodieWriteStat.getNumInserts());
                row.add(hoodieWriteStat.getNumDeletes());
                row.add(hoodieWriteStat.getNumUpdateWrites());
                row.add(hoodieWriteStat.getTotalLogFiles());
                row.add(hoodieWriteStat.getTotalLogBlocks());
                row.add(hoodieWriteStat.getTotalCorruptLogBlock());
                row.add(hoodieWriteStat.getTotalRollbackBlocks());
                row.add(hoodieWriteStat.getTotalLogRecords());
                row.add(hoodieWriteStat.getTotalUpdatedRecordsCompacted());
                row.add(hoodieWriteStat.getTotalWriteBytes());
                row.add(hoodieWriteStat.getTotalWriteErrors());
                return row;
              }))
              .map(rowList -> rowList.toArray(new Comparable[0]));
        })
        .collect(Collectors.toList());
    allStats.addAll(readCommits);
    reader.close();
  }
  TableHeader header = new TableHeader()
      .addTableHeaderField("action")
      .addTableHeaderField("instant")
      .addTableHeaderField("partition")
      .addTableHeaderField("file_id")
      .addTableHeaderField("prev_instant")
      .addTableHeaderField("num_writes")
      .addTableHeaderField("num_inserts")
      .addTableHeaderField("num_deletes")
      .addTableHeaderField("num_update_writes")
      .addTableHeaderField("total_log_files")
      .addTableHeaderField("total_log_blocks")
      .addTableHeaderField("total_corrupt_log_blocks")
      .addTableHeaderField("total_rollback_blocks")
      .addTableHeaderField("total_log_records")
      .addTableHeaderField("total_updated_records_compacted")
      .addTableHeaderField("total_write_bytes")
      .addTableHeaderField("total_write_errors");
  return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
}
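The read pattern at the core of this command (open a log-format reader over an archived log file and drain each Avro data block) can be condensed as follows. This is a minimal sketch, not Hudi source; it assumes fs and logPath (and the same imports as above) are already in scope.

// Minimal sketch: drain all HoodieAvroDataBlock records from one archived log file.
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(logPath),
    HoodieArchivedMetaEntry.getClassSchema());
List<IndexedRecord> archivedRecords = new ArrayList<>();
while (reader.hasNext()) {
  HoodieLogBlock block = reader.next();
  // The archived timeline is written as Avro data blocks, so cast and iterate the records.
  if (block instanceof HoodieAvroDataBlock) {
    ((HoodieAvroDataBlock) block).getRecordItr().forEachRemaining(archivedRecords::add);
  }
}
reader.close();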
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in the Apache Hudi project.
From the class TestHoodieLogFormat, method testV0Format:
@Test
public void testV0Format() throws IOException, URISyntaxException {
  // HoodieLogFormatVersion.DEFAULT_VERSION has been deprecated, so we cannot create a writer
  // for it. These tests therefore only cover the older-version HoodieAvroDataBlock.
  Schema schema = getSimpleSchema();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  List<IndexedRecord> recordsCopy = new ArrayList<>(records);
  assertEquals(100, records.size());
  assertEquals(100, recordsCopy.size());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
  byte[] content = dataBlock.getBytes(schema);
  assertTrue(content.length > 0);
  HoodieLogBlock logBlock = HoodieAvroDataBlock.getBlock(content, schema);
  assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType());
  List<IndexedRecord> readRecords = getRecords((HoodieAvroDataBlock) logBlock);
  assertEquals(readRecords.size(), recordsCopy.size());
  for (int i = 0; i < recordsCopy.size(); ++i) {
    assertEquals(recordsCopy.get(i), readRecords.get(i));
  }
  // The reader schema is optional if it is the same as the write schema
  logBlock = HoodieAvroDataBlock.getBlock(content, null);
  assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, logBlock.getBlockType());
  readRecords = getRecords((HoodieAvroDataBlock) logBlock);
  assertEquals(readRecords.size(), recordsCopy.size());
  for (int i = 0; i < recordsCopy.size(); ++i) {
    assertEquals(recordsCopy.get(i), readRecords.get(i));
  }
}
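The getRecords(...) helper used by this test is not shown in the snippet. A minimal sketch of what such a helper could look like, based on the getRecordItr() API used in the first example; the actual helper in the Hudi test sources may differ.

private static List<IndexedRecord> getRecords(HoodieAvroDataBlock dataBlock) throws IOException {
  // Drain the block's record iterator into a list so the test can compare element by element.
  List<IndexedRecord> records = new ArrayList<>();
  dataBlock.getRecordItr().forEachRemaining(records::add);
  return records;
}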
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in the Apache Hudi project.
From the class TestHoodieLogFormatAppendFailure, method testFailedToGetAppendStreamFromHDFSNameNode:
@Test
@Timeout(60)
public void testFailedToGetAppendStreamFromHDFSNameNode()
    throws IOException, URISyntaxException, InterruptedException, TimeoutException {
  // Use some fs like LocalFileSystem, that does not support appends
  String uuid = UUID.randomUUID().toString();
  Path localPartitionPath = new Path("/tmp/");
  FileSystem fs = cluster.getFileSystem();
  Path testPath = new Path(localPartitionPath, uuid);
  fs.mkdirs(testPath);
  // Write some data and append it.
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 10);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>(2);
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
  Writer writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(testPath)
      .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
      .withFileId("commits.archive")
      .overBaseCommit("")
      .withFs(fs)
      .build();
  writer.appendBlock(dataBlock);
  // Remember the current log file version so it can be compared later
  int logFileVersion = writer.getLogFile().getLogVersion();
  Path logFilePath = writer.getLogFile().getPath();
  writer.close();
  // Wait for the file to be replicated 3 times
  DFSTestUtil.waitReplication(fs, logFilePath, (short) 3);
  // Shut down all DataNodes that hold the last block location for the file
  LocatedBlocks lbs = cluster.getFileSystem().getClient().getNamenode()
      .getBlockLocations("/tmp/" + uuid + "/" + logFilePath.getName(), 0, Long.MAX_VALUE);
  List<DataNode> dnsOfCluster = cluster.getDataNodes();
  DatanodeInfo[] dnsWithLocations = lbs.getLastLocatedBlock().getLocations();
  for (DataNode dn : dnsOfCluster) {
    for (DatanodeInfo loc : dnsWithLocations) {
      if (dn.getDatanodeId().equals(loc)) {
        dn.shutdown();
        cluster.stopDataNode(dn.getDisplayName());
        DFSTestUtil.waitForDatanodeDeath(dn);
      }
    }
  }
  // Wait for the replication of this file to go down to 0
  DFSTestUtil.waitReplication(fs, logFilePath, (short) 0);
  // Opening a new Writer right now will throw an IOException. The code should handle this,
  // roll over the log file, and return a new writer with a bumped-up logVersion.
  writer = HoodieLogFormat.newWriterBuilder()
      .onParentPath(testPath)
      .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION)
      .withFileId("commits.archive")
      .overBaseCommit("")
      .withFs(fs)
      .build();
  header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
      String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
  writer.appendBlock(new HoodieCommandBlock(header));
  // The log version should be different for this new writer
  assertNotEquals(writer.getLogFile().getLogVersion(), logFileVersion);
}
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in the Apache Hudi project.
From the class HoodieWriteableTestTable, method appendRecordsToLogFile:
private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
  String partitionPath = groupedRecords.get(0).getPartitionPath();
  HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
  try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
      .onParentPath(new Path(basePath, partitionPath))
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId(location.getFileId())
      .overBaseCommit(location.getInstantTime())
      .withFs(fs)
      .build()) {
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
      try {
        GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
        HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
        return (IndexedRecord) val;
      } catch (IOException e) {
        LOG.warn("Failed to convert record " + r.toString(), e);
        return null;
      }
    }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
    return Pair.of(partitionPath, logWriter.getLogFile());
  }
}
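Taken together with the previous example, the write side follows a fixed recipe: build a HoodieLogFormat.Writer for the target file, populate the INSTANT_TIME and SCHEMA headers, and append a HoodieAvroDataBlock. A condensed sketch of that recipe is shown below; fs, parentPath, instantTime, fileId, schema, and avroRecords are illustrative names assumed to be prepared by the caller, not identifiers from the Hudi source above.

// Condensed write-side sketch (illustrative variable names, not Hudi source).
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
    .onParentPath(parentPath)                         // partition directory of the file group
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
    .withFileId(fileId)
    .overBaseCommit(instantTime)
    .withFs(fs)
    .build()) {
  // avroRecords is a List<IndexedRecord> already conforming to 'schema'
  logWriter.appendBlock(new HoodieAvroDataBlock(avroRecords, header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
}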
Use of org.apache.hudi.common.table.log.block.HoodieAvroDataBlock in the Apache Hudi project.
From the class HoodieLogFileReader, method readBlock:
// TODO: convert content and block length to long by using ByteBuffer; a raw byte[] only allows
// for a max of Integer size
private HoodieLogBlock readBlock() throws IOException {
  int blockSize;
  try {
    // 1. Read the total size of the block
    blockSize = (int) inputStream.readLong();
  } catch (EOFException | CorruptedLogFileException e) {
    // Create a corrupt block by finding the next MAGIC marker or EOF
    return createCorruptBlock();
  }
  // We may have had a crash which could have written this block partially.
  // Skip blockSize in the stream and we should either find a sync marker (start of the next
  // block) or EOF. If we find neither, then this block is corrupted.
  boolean isCorrupted = isBlockCorrupted(blockSize);
  if (isCorrupted) {
    return createCorruptBlock();
  }
  // 2. Read the version for this log format
  HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion();
  // 3. Read the block type for a log block
  HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion);
  // 4. Read the header for a log block, if present
  Map<HeaderMetadataType, String> header =
      nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
  // 5. Read the content length for the content
  // Fall back to the full block size if there is no content length
  // TODO replace w/ hasContentLength
  int contentLength =
      nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize;
  // 6. Read the content or skip content based on IO vs Memory trade-off by client
  long contentPosition = inputStream.getPos();
  boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION;
  Option<byte[]> content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily);
  // 7. Read the footer, if any
  Map<HeaderMetadataType, String> footer =
      nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
  // 8. Read the log block length, if present; it acts as a reverse pointer when traversing the
  // log file in reverse
  if (nextBlockVersion.hasLogBlockLength()) {
    inputStream.readLong();
  }
  // 9. Read the log block end position in the log file
  long blockEndPos = inputStream.getPos();
  HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
      new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos);
  switch (Objects.requireNonNull(blockType)) {
    case AVRO_DATA_BLOCK:
      if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
        return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
      } else {
        return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
            Option.ofNullable(readerSchema), header, footer, keyField);
      }
    case HFILE_DATA_BLOCK:
      checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
          String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
      return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
          Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
    case PARQUET_DATA_BLOCK:
      checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
          String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
      return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
          Option.ofNullable(readerSchema), header, footer, keyField);
    case DELETE_BLOCK:
      return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
    case COMMAND_BLOCK:
      return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
    default:
      throw new HoodieNotSupportedException("Unsupported Block " + blockType);
  }
}
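The trailing long written per block (step 8) is what lets a reader walk a log file backwards: seek to the end, read the stored block length, and jump back by that many bytes to the block's start. The following is a hypothetical illustration of that idea only, not Hudi's actual reverse-reader code; it assumes fs and logFilePath are in scope and that the trailing long records the total on-disk size of the block.

// Hypothetical reverse-traversal sketch; Hudi's real reverse reader may differ in detail.
FSDataInputStream in = fs.open(logFilePath);
long pos = fs.getFileStatus(logFilePath).getLen();
while (pos > 0) {
  in.seek(pos - Long.BYTES);      // the reverse pointer sits in the last 8 bytes of each block
  long blockLength = in.readLong();
  pos -= blockLength;             // assumed: blockLength covers the whole block, including the pointer
  in.seek(pos);
  // ... parse the block forwards from here, as readBlock() does
}
in.close();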