use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.
the class SegmentRecoveryComparator method compare.
/**
 * Orders two prepareRecovery responses.
 * <ol>
 * <li>A response carrying a segment state is greater than one without;
 * two responses without a segment state compare equal.</li>
 * <li>When both carry a segment state, a finalized segment is greater than
 * an in-progress one; two finalized segments compare equal and must have
 * the same end txid.</li>
 * <li>Two in-progress segments are ordered first by the highest epoch seen
 * (the larger of acceptedInEpoch and lastWriterEpoch), then by end txid.</li>
 * </ol>
 *
 * @param a first (logger, response) entry
 * @param b second (logger, response) entry
 * @return negative, zero or positive as {@code a} is less than, equal to or
 *         greater than {@code b}
 * @throws IllegalArgumentException if both responses carry segment states
 *         whose start txids differ
 * @throws AssertionError if both segments are finalized but their end txids
 *         differ
 */
@Override
public int compare(Entry<AsyncLogger, PrepareRecoveryResponseProto> a, Entry<AsyncLogger, PrepareRecoveryResponseProto> b) {
  final PrepareRecoveryResponseProto left = a.getValue();
  final PrepareRecoveryResponseProto right = b.getValue();

  // Rule 1: presence of a segment state wins over absence.
  final boolean leftHasSegment = left.hasSegmentState();
  final boolean rightHasSegment = right.hasSegmentState();
  if (leftHasSegment != rightHasSegment) {
    return Booleans.compare(leftHasSegment, rightHasSegment);
  }
  if (!leftHasSegment) {
    // Neither side has a segment: nothing to tell them apart.
    return 0;
  }

  // From here on both responses describe a segment.
  final SegmentStateProto leftSeg = left.getSegmentState();
  final SegmentStateProto rightSeg = right.getSegmentState();
  Preconditions.checkArgument(
      leftSeg.getStartTxId() == rightSeg.getStartTxId(),
      "Should only be called with responses for corresponding segments: "
          + "%s and %s do not have the same start txid.",
      left, right);

  // Rule 2: finalized beats in-progress.
  final boolean leftFinalized = !leftSeg.getIsInProgress();
  final boolean rightFinalized = !rightSeg.getIsInProgress();
  if (leftFinalized != rightFinalized) {
    return Booleans.compare(leftFinalized, rightFinalized);
  }
  if (leftFinalized) {
    // Two finalized copies of the same segment must agree on the end txid.
    if (leftSeg.getEndTxId() == rightSeg.getEndTxId()) {
      return 0;
    }
    throw new AssertionError("finalized segs with different lengths: " + left + ", " + right);
  }

  // Rule 3: both in-progress; higher seen epoch first, then longer log.
  final long leftSeenEpoch = Math.max(left.getAcceptedInEpoch(), left.getLastWriterEpoch());
  final long rightSeenEpoch = Math.max(right.getAcceptedInEpoch(), right.getLastWriterEpoch());
  return ComparisonChain.start()
      .compare(leftSeenEpoch, rightSeenEpoch)
      .compare(leftSeg.getEndTxId(), rightSeg.getEndTxId())
      .result();
}
use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.
the class Journal method acceptRecovery.
/**
 * Accepts a recovery proposal for the segment starting at
 * {@code segment.getStartTxId()}.
 * <p>
 * After validating the journal and the request and aborting the current
 * segment, this method:
 * <ol>
 * <li>rejects a segment whose end txid is not positive or is below its
 * start txid;</li>
 * <li>downloads the segment via {@code syncLog} from {@code fromUrl} when
 * there is no local segment for that start txid or the local segment's end
 * txid differs from the proposed one;</li>
 * <li>persists the new Paxos data (accepted epoch plus segment state);</li>
 * <li>only then moves any downloaded file to the in-progress edit log
 * path for the segment.</li>
 * </ol>
 *
 * @param reqInfo request metadata; its epoch is recorded as the accepted epoch
 * @param segment the segment state being proposed
 * @param fromUrl location passed to {@code syncLog} to fetch the segment
 * @throws IOException as declared; raised by the helpers called here
 *         (their bodies are not visible in this snippet)
 * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
 */
public synchronized void acceptRecovery(RequestInfo reqInfo, SegmentStateProto segment, URL fromUrl) throws IOException {
checkFormatted();
checkRequest(reqInfo);
abortCurSegment();
long segmentTxId = segment.getStartTxId();
// Basic sanity checks that the segment is well-formed and contains
// at least one transaction.
Preconditions.checkArgument(segment.getEndTxId() > 0 && segment.getEndTxId() >= segmentTxId, "bad recovery state for segment %s: %s", segmentTxId, TextFormat.shortDebugString(segment));
// oldData is whatever was previously persisted for this segment (may be
// null); newData is what this call will persist if it succeeds.
PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder().setAcceptedInEpoch(reqInfo.getEpoch()).setSegmentState(segment).build();
// checkRequest() call above should filter non-increasing epoch numbers.
if (oldData != null) {
alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(), "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n", oldData, newData);
}
// Stays null when no download is needed; checked again after the Paxos
// data has been persisted.
File syncedFile = null;
SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
if (currentSegment == null || currentSegment.getEndTxId() != segment.getEndTxId()) {
if (currentSegment == null) {
LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) + ": no current segment in place");
// Update the highest txid for lag metrics
updateHighestWrittenTxId(Math.max(segment.getEndTxId(), highestWrittenTxId));
} else {
LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) + ": old segment " + TextFormat.shortDebugString(currentSegment) + " is not the right length");
// Sanity check: replacing the current segment must not drop the committed
// txid. Fail if the current segment's range contains committedTxnId but
// the new segment's range does not.
if (txnRange(currentSegment).containsLong(committedTxnId.get()) && !txnRange(segment).containsLong(committedTxnId.get())) {
throw new AssertionError("Cannot replace segment " + TextFormat.shortDebugString(currentSegment) + " with new segment " + TextFormat.shortDebugString(segment) + ": would discard already-committed txn " + committedTxnId.get());
}
// Another paranoid check: we should not be asked to synchronize a log
// on top of a finalized segment.
alwaysAssert(currentSegment.getIsInProgress(), "Should never be asked to synchronize a different log on top of an " + "already-finalized segment");
// If highestWrittenTxId falls inside the segment being replaced, reset it
// to the new segment's end txid; this value is used for lag metrics.
if (txnRange(currentSegment).containsLong(highestWrittenTxId)) {
updateHighestWrittenTxId(segment.getEndTxId());
}
}
syncedFile = syncLog(reqInfo, segment, fromUrl);
} else {
LOG.info("Skipping download of log " + TextFormat.shortDebugString(segment) + ": already have up-to-date logs");
}
// This is one of the few places in the protocol where we have a single
// RPC that results in two distinct actions:
//
// - 1) Downloads the new log segment data (above)
// - 2) Records the new Paxos data about the synchronized segment (below)
//
// These need to be treated as a transaction from the perspective
// of any external process. We do this by treating the persistPaxosData()
// success as the "commit" of an atomic transaction. If we fail before
// this point, the downloaded edit log will only exist at a temporary
// path, and thus not change any externally visible state. If we fail
// after this point, then any future prepareRecovery() call will see
// the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
// roll forward the rename of the referenced log file.
//
// See also: HDFS-3955
//
// The fault points here are exercised by the randomized fault injection
// test case to ensure that this atomic "transaction" operates correctly.
JournalFaultInjector.get().beforePersistPaxosData();
persistPaxosData(segmentTxId, newData);
JournalFaultInjector.get().afterPersistPaxosData();
// Only after the "commit" above is the downloaded file moved into place.
if (syncedFile != null) {
FileUtil.replaceFile(syncedFile, storage.getInProgressEditLog(segmentTxId));
}
LOG.info("Accepted recovery for segment " + segmentTxId + ": " + TextFormat.shortDebugString(newData));
}
use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.
the class Journal method prepareRecovery.
/**
 * Builds this journal's response to a prepareRecovery request for the
 * segment starting at {@code segmentTxId}.
 * <p>
 * After validating the journal and the request and aborting the current
 * segment, any previously persisted Paxos data for the segment is loaded
 * and handed to {@code completeHalfDoneAcceptRecovery}. The response then
 * contains:
 * <ul>
 * <li>the previously accepted epoch and segment state, when Paxos data
 * exists and there is no finalized segment on disk;</li>
 * <li>otherwise the on-disk segment state, when a segment exists;</li>
 * <li>the last writer epoch, always;</li>
 * <li>the last committed txid, when it is not
 * {@code HdfsServerConstants.INVALID_TXID}.</li>
 * </ul>
 *
 * @param reqInfo request metadata, validated by {@code checkRequest}
 * @param segmentTxId start txid of the segment being recovered
 * @return the response described above
 * @throws IOException as declared; raised by the helpers called here
 * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
 */
public synchronized PrepareRecoveryResponseProto prepareRecovery(RequestInfo reqInfo, long segmentTxId) throws IOException {
  checkFormatted();
  checkRequest(reqInfo);
  abortCurSegment();

  PrepareRecoveryResponseProto.Builder builder = PrepareRecoveryResponseProto.newBuilder();

  PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
  completeHalfDoneAcceptRecovery(previouslyAccepted);

  SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
  boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();

  if (previouslyAccepted != null && !hasFinalizedSegment) {
    SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
    // hasFinalizedSegment is also false when segInfo is null, so this branch
    // can be reached without an on-disk segment. Keep the assertion (and its
    // message) null-safe so a violated invariant is reported as an
    // AssertionError with context rather than a bare NullPointerException.
    assert segInfo != null && acceptedState.getEndTxId() == segInfo.getEndTxId()
        : "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n"
            + "on disk: " + (segInfo == null ? "<none>" : TextFormat.shortDebugString(segInfo));
    builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch()).setSegmentState(acceptedState);
  } else if (segInfo != null) {
    builder.setSegmentState(segInfo);
  }

  builder.setLastWriterEpoch(lastWriterEpoch.get());
  // Only report a committed txid when one has actually been recorded.
  if (committedTxnId.get() != HdfsServerConstants.INVALID_TXID) {
    builder.setLastCommittedTxId(committedTxnId.get());
  }

  PrepareRecoveryResponseProto resp = builder.build();
  LOG.info("Prepared recovery for segment " + segmentTxId + ": " + TextFormat.shortDebugString(resp));
  return resp;
}
use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.
the class Journal method getSegmentInfo.
/**
 * Describes the edit log segment that starts at the given txid.
 * <p>
 * An in-progress file is scanned before its last txid is read. A file whose
 * last txid is {@code HdfsServerConstants.INVALID_TXID} is logged as empty,
 * moved aside, and reported as absent.
 *
 * @param segmentTxId start txid of the segment to look up
 * @return the segment's start txid, end txid and in-progress flag, or
 *         {@code null} if no log file exists for that txid or the file was
 *         empty
 * @throws IOException as declared; raised by the log-file operations used here
 */
@VisibleForTesting
SegmentStateProto getSegmentInfo(long segmentTxId) throws IOException {
  final EditLogFile logFile = fjm.getLogFile(segmentTxId);
  if (logFile == null) {
    return null;
  }

  if (logFile.isInProgress()) {
    logFile.scanLog(Long.MAX_VALUE, false);
  }

  final long lastTxId = logFile.getLastTxId();
  if (lastTxId == HdfsServerConstants.INVALID_TXID) {
    LOG.info("Edit log file " + logFile + " appears to be empty. " + "Moving it aside...");
    logFile.moveAsideEmptyFile();
    return null;
  }

  final SegmentStateProto.Builder stateBuilder = SegmentStateProto.newBuilder();
  stateBuilder.setStartTxId(segmentTxId);
  stateBuilder.setEndTxId(lastTxId);
  stateBuilder.setIsInProgress(logFile.isInProgress());
  final SegmentStateProto segmentState = stateBuilder.build();

  LOG.info("getSegmentInfo(" + segmentTxId + "): " + logFile + " -> " + TextFormat.shortDebugString(segmentState));
  return segmentState;
}
use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.
the class QuorumJournalManager method recoverUnclosedSegment.
/**
 * Run recovery/synchronization for a specific segment.
 * Postconditions:
 * <ul>
 * <li>This segment will be finalized on a majority
 * of nodes.</li>
 * <li>All nodes which contain the finalized segment will
 * agree on the length.</li>
 * </ul>
 * <p>
 * The steps are: collect prepareRecovery responses from a write quorum,
 * pick the greatest response according to {@code SegmentRecoveryComparator},
 * ask the loggers to accept that segment state (fetching from the chosen
 * logger's URL), and finally ask them to finalize the segment. If no
 * responder has anything to recover, the method returns after the first step.
 *
 * @param segmentTxId the starting txid of the segment
 * @throws IOException as declared; raised by the quorum calls made here
 */
private void recoverUnclosedSegment(long segmentTxId) throws IOException {
  Preconditions.checkArgument(segmentTxId > 0);
  LOG.info("Beginning recovery of unclosed segment starting at txid " + segmentTxId);

  // Step 1. Prepare recovery
  QuorumCall<AsyncLogger, PrepareRecoveryResponseProto> prepareCall = loggers.prepareRecovery(segmentTxId);
  Map<AsyncLogger, PrepareRecoveryResponseProto> responses =
      loggers.waitForWriteQuorum(prepareCall, prepareRecoveryTimeoutMs, "prepareRecovery(" + segmentTxId + ")");
  LOG.info("Recovery prepare phase complete. Responses:\n" + QuorumCall.mapToString(responses));

  // Choose the logger that either
  //   a) has already accepted a previous proposal higher than any other, or,
  //      failing that,
  //   b) has the longest log starting at this transaction ID.
  // TODO: we should collect any "ties" and pass the URL for all of them
  // when syncing, so we can tolerate failure during recovery better.
  Entry<AsyncLogger, PrepareRecoveryResponseProto> winner =
      Collections.max(responses.entrySet(), SegmentRecoveryComparator.INSTANCE);
  AsyncLogger winningLogger = winner.getKey();
  PrepareRecoveryResponseProto winningResponse = winner.getValue();

  // Log the decision and check invariants.
  if (!winningResponse.hasAcceptedInEpoch() && !winningResponse.hasSegmentState()) {
    // The winner has nothing to offer, so no responder should have a segment;
    // a bug in the comparator might violate that, hence the assertion.
    for (PrepareRecoveryResponseProto candidate : responses.values()) {
      assert !candidate.hasSegmentState() : "One of the loggers had a response, but no best logger " + "was found.";
    }
    LOG.info("None of the responders had a log to recover: " + QuorumCall.mapToString(responses));
    return;
  }
  if (winningResponse.hasAcceptedInEpoch()) {
    LOG.info("Using already-accepted recovery for segment " + "starting at txid " + segmentTxId + ": " + winner);
  } else {
    LOG.info("Using longest log: " + winner);
  }

  SegmentStateProto logToSync = winningResponse.getSegmentState();
  assert segmentTxId == logToSync.getStartTxId();
  final long syncStartTxId = logToSync.getStartTxId();
  final long syncEndTxId = logToSync.getEndTxId();

  // Sanity check: no responder may report a committed txid beyond the end
  // of the segment we are about to synchronize to.
  for (Entry<AsyncLogger, PrepareRecoveryResponseProto> responder : responses.entrySet()) {
    PrepareRecoveryResponseProto response = responder.getValue();
    if (!response.hasLastCommittedTxId()) {
      continue;
    }
    if (response.getLastCommittedTxId() > syncEndTxId) {
      throw new AssertionError("Decided to synchronize log to " + logToSync + " but logger " + responder.getKey()
          + " had seen txid " + response.getLastCommittedTxId() + " committed");
    }
  }

  // Step 2. Ask the loggers to accept the chosen segment.
  URL syncFromUrl = winningLogger.buildURLToFetchLogs(segmentTxId);
  QuorumCall<AsyncLogger, Void> acceptCall = loggers.acceptRecovery(logToSync, syncFromUrl);
  loggers.waitForWriteQuorum(acceptCall, acceptRecoveryTimeoutMs,
      "acceptRecovery(" + TextFormat.shortDebugString(logToSync) + ")");

  // Step 3. Finalize.
  // If one of the loggers above missed the synchronization step above, but
  // we send a finalize() here, that's OK. It validates the log before
  // finalizing. Hence, even if it is not "in sync", it won't incorrectly
  // finalize.
  QuorumCall<AsyncLogger, Void> finalizeCall = loggers.finalizeLogSegment(syncStartTxId, syncEndTxId);
  loggers.waitForWriteQuorum(finalizeCall, finalizeSegmentTimeoutMs,
      String.format("finalizeLogSegment(%s-%s)", syncStartTxId, syncEndTxId));
}
Aggregations