Search in sources:

Example 1 with SegmentStateProto

use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.

the class SegmentRecoveryComparator method compare.

@Override
public int compare(Entry<AsyncLogger, PrepareRecoveryResponseProto> a, Entry<AsyncLogger, PrepareRecoveryResponseProto> b) {
    PrepareRecoveryResponseProto r1 = a.getValue();
    PrepareRecoveryResponseProto r2 = b.getValue();
    // A response that has data for a segment is always better than one
    // that doesn't.
    if (r1.hasSegmentState() != r2.hasSegmentState()) {
        return Booleans.compare(r1.hasSegmentState(), r2.hasSegmentState());
    }
    if (!r1.hasSegmentState()) {
        // Neither response has a segment, so call them equal.
        return 0;
    }
    // They both have a segment.
    SegmentStateProto r1Seg = r1.getSegmentState();
    SegmentStateProto r2Seg = r2.getSegmentState();
    Preconditions.checkArgument(r1Seg.getStartTxId() == r2Seg.getStartTxId(),
        "Should only be called with responses for corresponding segments: " +
        "%s and %s do not have the same start txid.", r1, r2);
    // If one segment is in-progress and the other is finalized,
    // the finalized one is greater.
    if (r1Seg.getIsInProgress() != r2Seg.getIsInProgress()) {
        return Booleans.compare(!r1Seg.getIsInProgress(), !r2Seg.getIsInProgress());
    }
    if (!r1Seg.getIsInProgress()) {
        // If both are finalized, they should match lengths
        if (r1Seg.getEndTxId() != r2Seg.getEndTxId()) {
            throw new AssertionError("finalized segs with different lengths: " + r1 + ", " + r2);
        }
        return 0;
    }
    // Both are in-progress.
    long r1SeenEpoch = Math.max(r1.getAcceptedInEpoch(), r1.getLastWriterEpoch());
    long r2SeenEpoch = Math.max(r2.getAcceptedInEpoch(), r2.getLastWriterEpoch());
    return ComparisonChain.start()
        .compare(r1SeenEpoch, r2SeenEpoch)
        .compare(r1.getSegmentState().getEndTxId(),
            r2.getSegmentState().getEndTxId())
        .result();
}
Also used: SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto), PrepareRecoveryResponseProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto)
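
For reference, here is a minimal sketch of the ordering this comparator defines. All field values are invented, Guava's Maps.immutableEntry stands in for real map entries (compare() only reads entry values), and same-package access to SegmentRecoveryComparator.INSTANCE is assumed:

// Hedged sketch: hypothetical responses for the segment starting at txid 101.
SegmentStateProto finalized = SegmentStateProto.newBuilder()
    .setStartTxId(101).setEndTxId(150).setIsInProgress(false).build();
SegmentStateProto inProgress = SegmentStateProto.newBuilder()
    .setStartTxId(101).setEndTxId(170).setIsInProgress(true).build();
PrepareRecoveryResponseProto withFinalized = PrepareRecoveryResponseProto.newBuilder()
    .setLastWriterEpoch(3).setSegmentState(finalized).build();
PrepareRecoveryResponseProto withInProgress = PrepareRecoveryResponseProto.newBuilder()
    .setLastWriterEpoch(3).setSegmentState(inProgress).build();
// A finalized segment outranks even a longer in-progress one:
assert SegmentRecoveryComparator.INSTANCE.compare(
    Maps.immutableEntry((AsyncLogger) null, withFinalized),
    Maps.immutableEntry((AsyncLogger) null, withInProgress)) > 0;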

Example 2 with SegmentStateProto

use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.

the class Journal method acceptRecovery.

/**
   * @see QJournalProtocol#acceptRecovery(RequestInfo, QJournalProtocolProtos.SegmentStateProto, URL)
   */
public synchronized void acceptRecovery(RequestInfo reqInfo, SegmentStateProto segment, URL fromUrl) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);
    abortCurSegment();
    long segmentTxId = segment.getStartTxId();
    // Basic sanity checks that the segment is well-formed and contains
    // at least one transaction.
    Preconditions.checkArgument(segment.getEndTxId() > 0 &&
        segment.getEndTxId() >= segmentTxId,
        "bad recovery state for segment %s: %s",
        segmentTxId, TextFormat.shortDebugString(segment));
    PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
    PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
        .setAcceptedInEpoch(reqInfo.getEpoch())
        .setSegmentState(segment)
        .build();
    // If we previously accepted recovery in a higher epoch, this request is
    // out of order. This should never happen in practice, since the
    // checkRequest() call above should filter non-increasing epoch numbers.
    if (oldData != null) {
        alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(), "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n", oldData, newData);
    }
    File syncedFile = null;
    SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
    if (currentSegment == null || currentSegment.getEndTxId() != segment.getEndTxId()) {
        if (currentSegment == null) {
            LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) + ": no current segment in place");
            // Update the highest txid for lag metrics
            updateHighestWrittenTxId(Math.max(segment.getEndTxId(), highestWrittenTxId));
        } else {
            LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) + ": old segment " + TextFormat.shortDebugString(currentSegment) + " is not the right length");
            // Paranoid sanity check: if the new segment is shorter than the
            // one we currently have, we should not end up discarding any
            // transactions which are already committed.
            if (txnRange(currentSegment).containsLong(committedTxnId.get()) && !txnRange(segment).containsLong(committedTxnId.get())) {
                throw new AssertionError("Cannot replace segment " + TextFormat.shortDebugString(currentSegment) + " with new segment " + TextFormat.shortDebugString(segment) + ": would discard already-committed txn " + committedTxnId.get());
            }
            // Another paranoid check: we should not be asked to synchronize a log
            // on top of a finalized segment.
            alwaysAssert(currentSegment.getIsInProgress(), "Should never be asked to synchronize a different log on top of an " + "already-finalized segment");
            // If we're shortening the log, update the highest txid
            // used for lag metrics.
            if (txnRange(currentSegment).containsLong(highestWrittenTxId)) {
                updateHighestWrittenTxId(segment.getEndTxId());
            }
        }
        syncedFile = syncLog(reqInfo, segment, fromUrl);
    } else {
        LOG.info("Skipping download of log " + TextFormat.shortDebugString(segment) + ": already have up-to-date logs");
    }
    // This is one of the few places in the protocol where we have a single
    // RPC that results in two distinct actions:
    //
    // - 1) Downloads the new log segment data (above)
    // - 2) Records the new Paxos data about the synchronized segment (below)
    //
    // These need to be treated as a transaction from the perspective
    // of any external process. We do this by treating the persistPaxosData()
    // success as the "commit" of an atomic transaction. If we fail before
    // this point, the downloaded edit log will only exist at a temporary
    // path, and thus not change any externally visible state. If we fail
    // after this point, then any future prepareRecovery() call will see
    // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will
    // roll forward the rename of the referenced log file.
    //
    // See also: HDFS-3955
    //
    // The fault points here are exercised by the randomized fault injection
    // test case to ensure that this atomic "transaction" operates correctly.
    JournalFaultInjector.get().beforePersistPaxosData();
    persistPaxosData(segmentTxId, newData);
    JournalFaultInjector.get().afterPersistPaxosData();
    if (syncedFile != null) {
        FileUtil.replaceFile(syncedFile, storage.getInProgressEditLog(segmentTxId));
    }
    LOG.info("Accepted recovery for segment " + segmentTxId + ": " + TextFormat.shortDebugString(newData));
}
Also used: SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto), PersistedRecoveryPaxosData (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData), EditLogFile (org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile), PersistentLongFile (org.apache.hadoop.hdfs.util.PersistentLongFile), BestEffortLongFile (org.apache.hadoop.hdfs.util.BestEffortLongFile), File (java.io.File)
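
The long comment above describes a general commit-point idiom rather than a Hadoop-specific API. Below is a minimal sketch of the same pattern using only java.nio.file; the class name, paths, and decision payload are hypothetical, not Hadoop's implementation:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;

class CommitPointSketch {
    // Data is first staged at a temporary path (externally invisible), the
    // decision is then durably recorded (the commit point), and finally an
    // atomic rename makes the data visible. A crash before the record is
    // written changes nothing observable; a crash after it is repaired by
    // redoing the rename on recovery.
    static void commit(Path stagedData, Path decisionRecord, Path finalData,
                       byte[] decision) throws IOException {
        // Commit point: the durable write of the decision record.
        Files.write(decisionRecord, decision,
            StandardOpenOption.CREATE, StandardOpenOption.WRITE,
            StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
        // Roll forward: make the staged data externally visible.
        Files.move(stagedData, finalData, StandardCopyOption.ATOMIC_MOVE);
    }
}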

Example 3 with SegmentStateProto

use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.

the class Journal method prepareRecovery.

/**
   * @see QJournalProtocol#prepareRecovery(RequestInfo, long)
   */
public synchronized PrepareRecoveryResponseProto prepareRecovery(RequestInfo reqInfo, long segmentTxId) throws IOException {
    checkFormatted();
    checkRequest(reqInfo);
    abortCurSegment();
    PrepareRecoveryResponseProto.Builder builder = PrepareRecoveryResponseProto.newBuilder();
    PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
    completeHalfDoneAcceptRecovery(previouslyAccepted);
    SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
    boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress();
    if (previouslyAccepted != null && !hasFinalizedSegment) {
        SegmentStateProto acceptedState = previouslyAccepted.getSegmentState();
        assert acceptedState.getEndTxId() == segInfo.getEndTxId() :
            "prev accepted: " + TextFormat.shortDebugString(previouslyAccepted) + "\n" +
            "on disk:       " + TextFormat.shortDebugString(segInfo);
        builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
            .setSegmentState(previouslyAccepted.getSegmentState());
    } else {
        if (segInfo != null) {
            builder.setSegmentState(segInfo);
        }
    }
    builder.setLastWriterEpoch(lastWriterEpoch.get());
    if (committedTxnId.get() != HdfsServerConstants.INVALID_TXID) {
        builder.setLastCommittedTxId(committedTxnId.get());
    }
    PrepareRecoveryResponseProto resp = builder.build();
    LOG.info("Prepared recovery for segment " + segmentTxId + ": " + TextFormat.shortDebugString(resp));
    return resp;
}
Also used: SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto), PersistedRecoveryPaxosData (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData), PrepareRecoveryResponseProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto)
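
The response encodes two distinct cases, and a recovery coordinator must treat them differently. A small illustrative helper (hypothetical, not part of the Hadoop codebase) showing how the fields are meant to be read:

// Illustrative only; describeResponse is a hypothetical name.
static String describeResponse(PrepareRecoveryResponseProto resp) {
    if (resp.hasAcceptedInEpoch()) {
        // A recovery value was already accepted; per the Paxos acceptor rule
        // it takes precedence over any merely-longer on-disk log.
        return "previously accepted in epoch " + resp.getAcceptedInEpoch()
            + ": " + TextFormat.shortDebugString(resp.getSegmentState());
    } else if (resp.hasSegmentState()) {
        // No prior acceptance; the on-disk segment is the candidate value.
        return "on-disk segment: "
            + TextFormat.shortDebugString(resp.getSegmentState());
    } else {
        return "no segment at this txid (lastWriterEpoch="
            + resp.getLastWriterEpoch() + ")";
    }
}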

Example 4 with SegmentStateProto

use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.

the class Journal method getSegmentInfo.

/**
   * @return the current state of the given segment, or null if the
   * segment does not exist.
   */
@VisibleForTesting
SegmentStateProto getSegmentInfo(long segmentTxId) throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
        return null;
    }
    if (elf.isInProgress()) {
        elf.scanLog(Long.MAX_VALUE, false);
    }
    if (elf.getLastTxId() == HdfsServerConstants.INVALID_TXID) {
        LOG.info("Edit log file " + elf + " appears to be empty. " + "Moving it aside...");
        elf.moveAsideEmptyFile();
        return null;
    }
    SegmentStateProto ret = SegmentStateProto.newBuilder()
        .setStartTxId(segmentTxId)
        .setEndTxId(elf.getLastTxId())
        .setIsInProgress(elf.isInProgress())
        .build();
    LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " + TextFormat.shortDebugString(ret));
    return ret;
}
Also used: SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto), EditLogFile (org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
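
Since the method sets every SegmentStateProto field it returns, the same value object can be built standalone. A brief sketch with invented txids:

// Hypothetical txids; mirrors the builder call in getSegmentInfo() above.
SegmentStateProto seg = SegmentStateProto.newBuilder()
    .setStartTxId(101L)
    .setEndTxId(150L)
    .setIsInProgress(false)
    .build();
// Segment length in transactions, inclusive of both endpoints:
long numTxns = seg.getEndTxId() - seg.getStartTxId() + 1; // 50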

Example 5 with SegmentStateProto

use of org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto in project hadoop by apache.

the class QuorumJournalManager method recoverUnclosedSegment.

/**
   * Run recovery/synchronization for a specific segment.
   * Postconditions:
   * <ul>
   * <li>This segment will be finalized on a majority
   * of nodes.</li>
   * <li>All nodes which contain the finalized segment will
   * agree on the length.</li>
   * </ul>
   * 
   * @param segmentTxId the starting txid of the segment
   * @throws IOException
   */
private void recoverUnclosedSegment(long segmentTxId) throws IOException {
    Preconditions.checkArgument(segmentTxId > 0);
    LOG.info("Beginning recovery of unclosed segment starting at txid " + segmentTxId);
    // Step 1. Prepare recovery
    QuorumCall<AsyncLogger, PrepareRecoveryResponseProto> prepare = loggers.prepareRecovery(segmentTxId);
    Map<AsyncLogger, PrepareRecoveryResponseProto> prepareResponses = loggers.waitForWriteQuorum(prepare, prepareRecoveryTimeoutMs, "prepareRecovery(" + segmentTxId + ")");
    LOG.info("Recovery prepare phase complete. Responses:\n" + QuorumCall.mapToString(prepareResponses));
    // Determine the logger who either:
    // a) Has already accepted a previous proposal that's higher than any
    //    other
    //
    //  OR, if no such logger exists:
    //
    // b) Has the longest log starting at this transaction ID
    // TODO: we should collect any "ties" and pass the URL for all of them
    // when syncing, so we can tolerate failure during recovery better.
    Entry<AsyncLogger, PrepareRecoveryResponseProto> bestEntry = Collections.max(prepareResponses.entrySet(), SegmentRecoveryComparator.INSTANCE);
    AsyncLogger bestLogger = bestEntry.getKey();
    PrepareRecoveryResponseProto bestResponse = bestEntry.getValue();
    // Log the above decision, check invariants.
    if (bestResponse.hasAcceptedInEpoch()) {
        LOG.info("Using already-accepted recovery for segment " + "starting at txid " + segmentTxId + ": " + bestEntry);
    } else if (bestResponse.hasSegmentState()) {
        LOG.info("Using longest log: " + bestEntry);
    } else {
        // The best response had no segment, so none of the responders should
        // have one either, but a bug in the comparator might cause us to get here.
        for (PrepareRecoveryResponseProto resp : prepareResponses.values()) {
            assert !resp.hasSegmentState() : "One of the loggers had a response, but no best logger " + "was found.";
        }
        LOG.info("None of the responders had a log to recover: " + QuorumCall.mapToString(prepareResponses));
        return;
    }
    SegmentStateProto logToSync = bestResponse.getSegmentState();
    assert segmentTxId == logToSync.getStartTxId();
    // Sanity check: none of the loggers should be aware of a higher committed
    // txid than the txid we intend to truncate to
    for (Map.Entry<AsyncLogger, PrepareRecoveryResponseProto> e : prepareResponses.entrySet()) {
        AsyncLogger logger = e.getKey();
        PrepareRecoveryResponseProto resp = e.getValue();
        if (resp.hasLastCommittedTxId() && resp.getLastCommittedTxId() > logToSync.getEndTxId()) {
            throw new AssertionError("Decided to synchronize log to " + logToSync + " but logger " + logger + " had seen txid " + resp.getLastCommittedTxId() + " committed");
        }
    }
    URL syncFromUrl = bestLogger.buildURLToFetchLogs(segmentTxId);
    QuorumCall<AsyncLogger, Void> accept = loggers.acceptRecovery(logToSync, syncFromUrl);
    loggers.waitForWriteQuorum(accept, acceptRecoveryTimeoutMs, "acceptRecovery(" + TextFormat.shortDebugString(logToSync) + ")");
    // If one of the loggers above missed the synchronization step above, but
    // we send a finalize() here, that's OK. It validates the log before
    // finalizing. Hence, even if it is not "in sync", it won't incorrectly
    // finalize.
    QuorumCall<AsyncLogger, Void> finalize = loggers.finalizeLogSegment(logToSync.getStartTxId(), logToSync.getEndTxId());
    loggers.waitForWriteQuorum(finalize, finalizeSegmentTimeoutMs, String.format("finalizeLogSegment(%s-%s)", logToSync.getStartTxId(), logToSync.getEndTxId()));
}
Also used: SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto), PrepareRecoveryResponseProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto), Map (java.util.Map), URL (java.net.URL)
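
To make the selection step concrete, here is a hypothetical trace of Collections.max() with the comparator over three prepare responses. All values are invented, and Maps.immutableEntry with null keys is used because the comparator reads only entry values:

// Hedged sketch: three journal nodes respond for the segment at txid 101.
PrepareRecoveryResponseProto jn1 = PrepareRecoveryResponseProto.newBuilder()
    .setLastWriterEpoch(3)
    .setSegmentState(SegmentStateProto.newBuilder()
        .setStartTxId(101).setEndTxId(160).setIsInProgress(true).build())
    .build();
PrepareRecoveryResponseProto jn2 = PrepareRecoveryResponseProto.newBuilder()
    .setLastWriterEpoch(3)
    .setAcceptedInEpoch(4)  // already accepted a recovery proposal
    .setSegmentState(SegmentStateProto.newBuilder()
        .setStartTxId(101).setEndTxId(155).setIsInProgress(true).build())
    .build();
PrepareRecoveryResponseProto jn3 = PrepareRecoveryResponseProto.newBuilder()
    .setLastWriterEpoch(3)  // no segment at all for this txid
    .build();

// jn2 wins: its max(acceptedInEpoch, lastWriterEpoch) = 4 beats jn1's 3,
// even though jn1's in-progress log extends further.
Entry<AsyncLogger, PrepareRecoveryResponseProto> best = Collections.max(
    ImmutableList.of(
        Maps.immutableEntry((AsyncLogger) null, jn1),
        Maps.immutableEntry((AsyncLogger) null, jn2),
        Maps.immutableEntry((AsyncLogger) null, jn3)),
    SegmentRecoveryComparator.INSTANCE);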

Aggregations

SegmentStateProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto): 6 usages
PrepareRecoveryResponseProto (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto): 3 usages
PersistedRecoveryPaxosData (org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData): 2 usages
EditLogFile (org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile): 2 usages
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1 usage
File (java.io.File): 1 usage
URL (java.net.URL): 1 usage
Map (java.util.Map): 1 usage
BestEffortLongFile (org.apache.hadoop.hdfs.util.BestEffortLongFile): 1 usage
PersistentLongFile (org.apache.hadoop.hdfs.util.PersistentLongFile): 1 usage
Test (org.junit.Test): 1 usage