Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.
The class HoodieMergeOnReadTableInputFormat, method createRealtimeFileStatusUnchecked.
/**
 * Creates {@link RealtimeFileStatus} for the file-slice where base file is NOT present
 */
private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieLogFile latestLogFile, Stream<HoodieLogFile> logFiles, String basePath, Option<HoodieInstant> latestCompletedInstantOpt, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
  List<HoodieLogFile> sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
  try {
    RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(latestLogFile.getFileStatus(), basePath, sortedLogFiles, false, virtualKeyInfoOpt);
    if (latestCompletedInstantOpt.isPresent()) {
      HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get();
      checkState(latestCompletedInstant.isCompleted());
      rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp());
    }
    return rtFileStatus;
  } catch (IOException e) {
    throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e);
  }
}
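The method above illustrates the wrap-and-rethrow idiom that runs through all of these usages: a checked IOException raised while handling Hadoop file statuses is rethrown as the unchecked HoodieIOException so callers are not forced to declare it. A minimal, self-contained sketch of that idiom, using a simplified stand-in class rather than the real org.apache.hudi.exception.HoodieIOException and a hypothetical readFileStatus helper:

import java.io.IOException;

public class WrapIoExceptionSketch {

  // Simplified stand-in for org.apache.hudi.exception.HoodieIOException (unchecked).
  static class HoodieIOException extends RuntimeException {
    HoodieIOException(String msg, Throwable cause) {
      super(msg, cause);
    }
  }

  // Illustrative helper that fails with a checked IOException.
  static void readFileStatus(String path) throws IOException {
    throw new IOException("simulated failure reading " + path);
  }

  public static void main(String[] args) {
    try {
      readFileStatus("/tmp/example.log");
    } catch (IOException e) {
      // Rethrow as an unchecked exception so callers need not declare IOException
      throw new HoodieIOException(String.format("Failed to read %s", "/tmp/example.log"), e);
    }
  }
}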
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.
The class RepairsCommand, method removeCorruptedPendingCleanAction.
@CliCommand(value = "repair corrupted clean files", help = "repair corrupted clean files")
public void removeCorruptedPendingCleanAction() {
  HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
  HoodieTimeline cleanerTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline().getCleanerTimeline();
  LOG.info("Inspecting pending clean metadata in timeline for corrupted files");
  cleanerTimeline.filterInflightsAndRequested().getInstants().forEach(instant -> {
    try {
      CleanerUtils.getCleanerPlan(client, instant);
    } catch (AvroRuntimeException e) {
      LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
      HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant);
    } catch (IOException ioe) {
      if (ioe.getMessage().contains("Not an Avro data file")) {
        LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
        HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant);
      } else {
        throw new HoodieIOException(ioe.getMessage(), ioe);
      }
    }
  });
}
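The command treats an AvroRuntimeException, or an IOException whose message contains "Not an Avro data file", as evidence that the pending clean metadata file is corrupted and simply deletes the instant file; any other IOException is rethrown as HoodieIOException. A stripped-down sketch of the same detect-and-delete pattern, with java.nio in place of the Hadoop FileSystem and a hypothetical parsePlan helper standing in for CleanerUtils.getCleanerPlan:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

public class CorruptCleanPlanRepairSketch {

  // Hypothetical parser standing in for CleanerUtils.getCleanerPlan(client, instant).
  // Avro container files start with the bytes "Obj"; anything else is treated as corrupt.
  static void parsePlan(Path planFile) throws IOException {
    byte[] bytes = Files.readAllBytes(planFile);
    if (bytes.length < 3 || bytes[0] != 'O' || bytes[1] != 'b' || bytes[2] != 'j') {
      throw new IOException("Not an Avro data file: " + planFile);
    }
  }

  public static void main(String[] args) throws IOException {
    // Simulate a corrupted pending clean plan file
    Path corruptPlan = Files.createTempFile("001", ".clean.requested");
    Files.write(corruptPlan, new byte[] {0, 1, 2});

    for (Path plan : List.of(corruptPlan)) {
      try {
        parsePlan(plan);
      } catch (IOException ioe) {
        if (ioe.getMessage().contains("Not an Avro data file")) {
          // Corrupted pending clean metadata: delete the instant file, mirroring the CLI command
          System.out.println("Removing corrupted clean instant file: " + plan);
          Files.deleteIfExists(plan);
        } else {
          throw ioe; // unrelated I/O failure, surface it unchanged
        }
      }
    }
  }
}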
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.
The class AbstractHoodieLogRecordReader, method scan.
public synchronized void scan(Option<List<String>> keys) {
  currentInstantLogBlocks = new ArrayDeque<>();
  progress = 0.0f;
  totalLogFiles = new AtomicLong(0);
  totalRollbacks = new AtomicLong(0);
  totalCorruptBlocks = new AtomicLong(0);
  totalLogBlocks = new AtomicLong(0);
  totalLogRecords = new AtomicLong(0);
  HoodieLogFormatReader logFormatReaderWrapper = null;
  HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline();
  HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants();
  HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights();
  try {
    // Get the key field based on populate meta fields config
    // and the table type
    final String keyField = getKeyField();
    // Iterate over the paths
    logFormatReaderWrapper = new HoodieLogFormatReader(fs, logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()), readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField);
    Set<HoodieLogFile> scannedLogFiles = new HashSet<>();
    while (logFormatReaderWrapper.hasNext()) {
      HoodieLogFile logFile = logFormatReaderWrapper.getLogFile();
      LOG.info("Scanning log file " + logFile);
      scannedLogFiles.add(logFile);
      totalLogFiles.set(scannedLogFiles.size());
      // Use the HoodieLogFileReader to iterate through the blocks in the log file
      HoodieLogBlock logBlock = logFormatReaderWrapper.next();
      final String instantTime = logBlock.getLogBlockHeader().get(INSTANT_TIME);
      totalLogBlocks.incrementAndGet();
      if (logBlock.getBlockType() != CORRUPT_BLOCK && !HoodieTimeline.compareTimestamps(logBlock.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime)) {
        // hit a block with instant time greater than should be processed, stop processing further
        break;
      }
      if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) {
        if (!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) || inflightInstantsTimeline.containsInstant(instantTime)) {
          // hit an uncommitted block possibly from a failed write, move to the next one and skip processing this one
          continue;
        }
        if (instantRange.isPresent() && !instantRange.get().isInRange(instantTime)) {
          // filter the log block by instant range
          continue;
        }
      }
      switch (logBlock.getBlockType()) {
        case HFILE_DATA_BLOCK:
        case AVRO_DATA_BLOCK:
        case PARQUET_DATA_BLOCK:
          LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + logBlock.getLogBlockHeader().get(INSTANT_TIME));
          if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
            // If this is an avro data block belonging to a different commit/instant,
            // then merge the last blocks and records into the main result
            processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
          }
          // store the current block
          currentInstantLogBlocks.push(logBlock);
          break;
        case DELETE_BLOCK:
          LOG.info("Reading a delete block from file " + logFile.getPath());
          if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
            // If this is a delete data block belonging to a different commit/instant,
            // then merge the last blocks and records into the main result
            processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
          }
          // store deletes so can be rolled back
          currentInstantLogBlocks.push(logBlock);
          break;
        case COMMAND_BLOCK:
          // Consider the following scenario
          // (Time 0, C1, Task T1) -> Running
          // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct
          // DataBlock (B1) with commitTime C1
          // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2)
          // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2)
          // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same.
          // Say, commit C1 eventually failed and a rollback is triggered.
          // Rollback will write only 1 rollback block (R1) since it assumes one block is
          // written per ingestion batch for a file but in reality we need to rollback (B1 & B2)
          // The following code ensures the same rollback block (R1) is used to rollback
          // both B1 & B2
          LOG.info("Reading a command block from file " + logFile.getPath());
          // This is a command block - take appropriate action based on the command
          HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock;
          String targetInstantForCommandBlock = logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME);
          // there can be different types of command blocks
          switch (commandBlock.getType()) {
            case ROLLBACK_PREVIOUS_BLOCK:
              // Rollback the last read log block
              // Get commit time from last record block, compare with targetCommitTime,
              // rollback only if equal, this is required in scenarios of invalid/extra
              // rollback blocks written due to failures during the rollback operation itself
              // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with
              // same instant_time
              int numBlocksRolledBack = 0;
              totalRollbacks.incrementAndGet();
              while (!currentInstantLogBlocks.isEmpty()) {
                HoodieLogBlock lastBlock = currentInstantLogBlocks.peek();
                // handle corrupt blocks separately since they may not have metadata
                if (lastBlock.getBlockType() == CORRUPT_BLOCK) {
                  LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath());
                  currentInstantLogBlocks.pop();
                  numBlocksRolledBack++;
                } else if (targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) {
                  // rollback last data block or delete block
                  LOG.info("Rolling back the last log block read in " + logFile.getPath());
                  currentInstantLogBlocks.pop();
                  numBlocksRolledBack++;
                } else if (!targetInstantForCommandBlock.contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) {
                  // invalid or extra rollback block
                  LOG.warn("TargetInstantTime " + targetInstantForCommandBlock + " invalid or extra rollback command block in " + logFile.getPath());
                  break;
                } else {
                  // this should not happen ideally
                  LOG.warn("Unable to apply rollback command block in " + logFile.getPath());
                }
              }
              LOG.info("Number of applied rollback blocks " + numBlocksRolledBack);
              break;
            default:
              throw new UnsupportedOperationException("Command type not yet supported.");
          }
          break;
        case CORRUPT_BLOCK:
          LOG.info("Found a corrupt block in " + logFile.getPath());
          totalCorruptBlocks.incrementAndGet();
          // If there is a corrupt block - we will assume that this was the next data block
          currentInstantLogBlocks.push(logBlock);
          break;
        default:
          throw new UnsupportedOperationException("Block type not supported yet");
      }
    }
    // merge the last read block when all the blocks are done reading
    if (!currentInstantLogBlocks.isEmpty()) {
      LOG.info("Merging the final data blocks");
      processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
    }
    // Done
    progress = 1.0f;
  } catch (IOException e) {
    LOG.error("Got IOException when reading log file", e);
    throw new HoodieIOException("IOException when reading log file ", e);
  } catch (Exception e) {
    LOG.error("Got exception when reading log file", e);
    throw new HoodieException("Exception when reading log file ", e);
  } finally {
    try {
      if (null != logFormatReaderWrapper) {
        logFormatReaderWrapper.close();
      }
    } catch (IOException ioe) {
      // Eat exception as we do not want to mask the original exception that can happen
      LOG.error("Unable to close log format reader", ioe);
    }
  }
}
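The COMMAND_BLOCK branch above treats currentInstantLogBlocks as a stack: data and delete blocks are pushed as they are read, and a ROLLBACK_PREVIOUS_BLOCK command pops every queued block whose instant time matches the command's target instant, which is how a single rollback block can undo both B1 and B2 from the scenario described in the comments. A stripped-down sketch of that stack discipline, with block types reduced to plain strings rather than the actual HoodieLogBlock API:

import java.util.ArrayDeque;
import java.util.Deque;

public class RollbackStackSketch {
  public static void main(String[] args) {
    // Each entry models a queued data/delete block tagged with its instant time.
    Deque<String> queuedBlockInstants = new ArrayDeque<>();
    queuedBlockInstants.push("c1"); // data block B1 written by the failed attempt
    queuedBlockInstants.push("c1"); // data block B2 written by the retried attempt

    String rollbackTargetInstant = "c1"; // from the command block's TARGET_INSTANT_TIME header
    int numBlocksRolledBack = 0;

    // Pop every queued block belonging to the target instant; one rollback
    // command therefore undoes both B1 and B2.
    while (!queuedBlockInstants.isEmpty()
        && rollbackTargetInstant.contentEquals(queuedBlockInstants.peek())) {
      queuedBlockInstants.pop();
      numBlocksRolledBack++;
    }
    System.out.println("Rolled back " + numBlocksRolledBack + " blocks"); // prints 2
  }
}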
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.
The class HoodieLogFormatReader, method hasNext.
@Override
public boolean hasNext() {
  if (currentReader == null) {
    return false;
  } else if (currentReader.hasNext()) {
    return true;
  } else if (logFiles.size() > 0) {
    try {
      HoodieLogFile nextLogFile = logFiles.remove(0);
      // Close the previous reader eagerly only when blocks are not read lazily;
      // otherwise keep it open and remember it so it can be closed later
      if (!readBlocksLazily) {
        this.currentReader.close();
      } else {
        this.prevReadersInOpenState.add(currentReader);
      }
      this.currentReader = new HoodieLogFileReader(fs, nextLogFile, readerSchema, bufferSize, readBlocksLazily, false, enableInlineReading, recordKeyField);
    } catch (IOException io) {
      throw new HoodieIOException("unable to initialize read with log file ", io);
    }
    LOG.info("Moving to the next reader for logfile " + currentReader.getLogFile());
    return hasNext();
  }
  return false;
}
Use of org.apache.hudi.exception.HoodieIOException in project hudi by apache.
The class HoodieLogFormatWriter, method appendBlocks.
@Override
public AppendResult appendBlocks(List<HoodieLogBlock> blocks) throws IOException, InterruptedException {
  // Find current version
  HoodieLogFormat.LogFormatVersion currentLogFormatVersion = new HoodieLogFormatVersion(HoodieLogFormat.CURRENT_VERSION);
  FSDataOutputStream originalOutputStream = getOutputStream();
  long startPos = originalOutputStream.getPos();
  long sizeWritten = 0;
  // HUDI-2655. here we wrap originalOutputStream to ensure huge blocks can be correctly written
  FSDataOutputStream outputStream = new FSDataOutputStream(originalOutputStream, new FileSystem.Statistics(fs.getScheme()), startPos);
  for (HoodieLogBlock block : blocks) {
    long startSize = outputStream.size();
    // 1. Write the magic header for the start of the block
    outputStream.write(HoodieLogFormat.MAGIC);
    // bytes for header
    byte[] headerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockHeader());
    // content bytes
    byte[] content = block.getContentBytes();
    // bytes for footer
    byte[] footerBytes = HoodieLogBlock.getLogMetadataBytes(block.getLogBlockFooter());
    // 2. Write the total size of the block (excluding Magic)
    outputStream.writeLong(getLogBlockLength(content.length, headerBytes.length, footerBytes.length));
    // 3. Write the version of this log block
    outputStream.writeInt(currentLogFormatVersion.getVersion());
    // 4. Write the block type
    outputStream.writeInt(block.getBlockType().ordinal());
    // 5. Write the headers for the log block
    outputStream.write(headerBytes);
    // 6. Write the size of the content block
    outputStream.writeLong(content.length);
    // 7. Write the contents of the data block
    outputStream.write(content);
    // 8. Write the footers for the log block
    outputStream.write(footerBytes);
    // 9. Write the total size of the log block (including magic) which is everything written
    // until now (for reverse pointer)
    // Update: this information is now used in determining if a block is corrupt by comparing to the
    // block size in header. This change assumes that the block size will be the last data written
    // to a block. Read will break if any data is written past this point for a block.
    outputStream.writeLong(outputStream.size() - startSize);
    // HUDI-2655. Check the size written to avoid log blocks whose size overflow.
    if (outputStream.size() == Integer.MAX_VALUE) {
      throw new HoodieIOException("Blocks appended may overflow. Please decrease log block size or log block amount");
    }
    sizeWritten += outputStream.size() - startSize;
  }
  // Flush all blocks to disk
  flush();
  AppendResult result = new AppendResult(logFile, startPos, sizeWritten);
  // roll over if size is past the threshold
  rolloverIfNeeded();
  return result;
}