Search in sources :

Example 36 with FailureContext

use of org.apache.ignite.failure.FailureContext in project ignite by apache.

the class GridCacheDatabaseSharedManager method restoreBinaryMemory.

/**
 * Performs binary (physical) memory recovery at node startup and resumes WAL logging afterwards.
 * <p>
 * Database lifecycle listeners are notified before and after the recovery. Once logging is resumed,
 * a {@link MemoryRecoveryRecord} is written so that old physical records are not replayed during the
 * next physical recovery.
 *
 * @param cacheGroupsPredicate Cache groups to restore.
 * @param recordTypePredicate Filter records by type.
 * @return Binary recovery state, which carries the last WAL pointer read during the recovery.
 * @throws IgniteCheckedException If failed. When the failure is caused by a {@link StorageException} or an
 *      {@link IOException}, it is also reported to the failure processor as a critical error before
 *      being rethrown.
 */
private RestoreBinaryState restoreBinaryMemory(IgnitePredicate<Integer> cacheGroupsPredicate, IgniteBiPredicate<WALRecord.RecordType, WALPointer> recordTypePredicate) throws IgniteCheckedException {
    final long startedAt = System.currentTimeMillis();

    try {
        if (log.isInfoEnabled())
            log.info("Starting binary memory restore for: " + cctx.cache().cacheGroupDescriptors().keySet());

        for (DatabaseLifecycleListener listener : getDatabaseListeners(cctx.kernalContext()))
            listener.beforeBinaryMemoryRestore(this);

        CheckpointStatus cpStatus = readCheckpointStatus();

        // Bring memory to the last consistent checkpoint state if needed.
        // The returned state should point to the last valid record in the WAL.
        RestoreBinaryState state = performBinaryMemoryRestore(cpStatus, cacheGroupsPredicate, recordTypePredicate, true);

        WALPointer lastRead = state.lastReadRecordPointer();

        // NULL_PTR means the record to be written is the first one, so there is nothing to resume from.
        WALPointer resumeFrom = lastRead.equals(CheckpointStatus.NULL_PTR) ? null : lastRead.next();

        if (resumeFrom != null)
            U.log(log, "Binary memory state restored at node startup [restoredPtr=" + resumeFrom + ']');
        else if (!cpStatus.endPtr.equals(CheckpointStatus.NULL_PTR)) {
            // A checkpoint end marker exists, but no WAL record could be read.
            throw new StorageException("The memory cannot be restored. The critical part of WAL archive is missing " + "[tailWalPtr=" + resumeFrom + ", endPtr=" + cpStatus.endPtr + ']');
        }

        // WAL logging is available from this point on.
        cctx.wal().resumeLogging(resumeFrom);

        // Log MemoryRecoveryRecord to make sure that old physical records are not replayed during
        // the next physical recovery.
        WALPointer recoveryRecPtr = cctx.wal().log(new MemoryRecoveryRecord(U.currentTimeMillis()));

        checkpointManager.memoryRecoveryRecordPtr(recoveryRecPtr);

        for (DatabaseLifecycleListener listener : getDatabaseListeners(cctx.kernalContext()))
            listener.afterBinaryMemoryRestore(this, state);

        if (log.isInfoEnabled()) {
            long elapsed = System.currentTimeMillis() - startedAt;

            log.info("Binary recovery performed in " + elapsed + " ms.");
        }

        return state;
    } catch (IgniteCheckedException e) {
        boolean storageFailure = X.hasCause(e, StorageException.class, IOException.class);

        // Storage-level problems are escalated to the failure processor; the exception is rethrown either way.
        if (storageFailure)
            cctx.kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, e));

        throw e;
    }
}
Also used : CheckpointStatus(org.apache.ignite.internal.processors.cache.persistence.checkpoint.CheckpointStatus) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) FailureContext(org.apache.ignite.failure.FailureContext) IOException(java.io.IOException) WALPointer(org.apache.ignite.internal.processors.cache.persistence.wal.WALPointer) MemoryRecoveryRecord(org.apache.ignite.internal.pagemem.wal.record.MemoryRecoveryRecord)

Example 37 with FailureContext

use of org.apache.ignite.failure.FailureContext in project ignite by apache.

the class Checkpointer method body.

/**
 * {@inheritDoc}
 * <p>
 * Main loop of the checkpointer: waits for a checkpoint event and, if checkpoints are enabled, performs
 * a checkpoint; otherwise the next checkpoint time is rescheduled. The loop runs until the worker is
 * cancelled, after which one final checkpoint is performed unless checkpoints are disabled or
 * {@code shutdownNow} is set.
 * <p>
 * On exit, any error (or an unexpected termination without cancellation) is reported to the failure
 * processor, and the scheduled checkpoint is failed with a {@link NodeStoppingException}.
 */
@Override
protected void body() {
    Throwable err = null;
    try {
        while (!isCancelled()) {
            waitCheckpointEvent();
            if (skipCheckpointOnNodeStop && (isCancelled() || shutdownNow)) {
                // A warning must not be conditional on INFO being enabled, otherwise it is lost
                // exactly when the log level is raised to WARN.
                log.warning("Skipping last checkpoint because node is stopping.");
                return;
            }
            // Read the field once so the future that is completed is the one that is cleared.
            GridFutureAdapter<Void> enableChangeApplied = this.enableChangeApplied;
            if (enableChangeApplied != null) {
                enableChangeApplied.onDone();
                this.enableChangeApplied = null;
            }
            if (checkpointsEnabled)
                doCheckpoint();
            else {
                // Checkpoints are disabled: only push the next scheduled checkpoint time forward.
                synchronized (this) {
                    scheduledCp.nextCpNanos(System.nanoTime() + U.millisToNanos(nextCheckpointInterval()));
                }
            }
        }
        // Final run after the cancellation.
        if (checkpointsEnabled && !shutdownNow)
            doCheckpoint();
    } catch (Throwable t) {
        err = t;
        scheduledCp.fail(t);
        throw t;
    } finally {
        // Leaving the loop without an error and without cancellation is itself treated as a failure.
        if (err == null && !isCancelled)
            err = new IllegalStateException("Thread is terminated unexpectedly: " + name());
        if (err instanceof OutOfMemoryError)
            failureProcessor.process(new FailureContext(CRITICAL_ERROR, err));
        else if (err != null)
            failureProcessor.process(new FailureContext(SYSTEM_WORKER_TERMINATION, err));
        scheduledCp.fail(new NodeStoppingException("Node is stopping."));
    }
}
Also used : NodeStoppingException(org.apache.ignite.internal.NodeStoppingException) FailureContext(org.apache.ignite.failure.FailureContext)

Example 38 with FailureContext

use of org.apache.ignite.failure.FailureContext in project ignite by apache.

the class Checkpointer method doCheckpoint.

/**
 * Performs a single checkpoint: marks the checkpoint begin, writes the checkpoint pages if there is
 * a delta, destroys evicted partitions, marks the checkpoint end and updates metrics.
 * <p>
 * A failure while marking the checkpoint begin is reported to the failure processor as a critical error
 * and rethrown as an unchecked {@link IgniteException} to stop the checkpoint thread. Any other
 * {@link IgniteCheckedException} fails the checkpoint progress (when one has been created) and is
 * reported to the failure processor as a critical error without being rethrown.
 */
private void doCheckpoint() {
    Checkpoint chp = null;
    try {
        CheckpointMetricsTracker tracker = new CheckpointMetricsTracker();
        startCheckpointProgress();
        try {
            chp = checkpointWorkflow.markCheckpointBegin(lastCpTs, curCpProgress, tracker, this);
        } catch (Exception e) {
            if (curCpProgress != null)
                curCpProgress.fail(e);
            // In case of checkpoint initialization error node should be invalidated and stopped.
            failureProcessor.process(new FailureContext(FailureType.CRITICAL_ERROR, e));
            // Re-throw as unchecked exception to force stopping checkpoint thread.
            throw new IgniteException(e);
        }
        updateHeartbeat();
        currentProgress().initCounters(chp.pagesSize);
        if (chp.hasDelta()) {
            if (log.isInfoEnabled()) {
                long possibleJvmPauseDur = possibleLongJvmPauseDuration(tracker);
                log.info(String.format(CHECKPOINT_STARTED_LOG_FORMAT, chp.cpEntry == null ? "" : chp.cpEntry.checkpointId(), chp.cpEntry == null ? "" : chp.cpEntry.checkpointMark(), tracker.beforeLockDuration(), tracker.lockWaitDuration(), tracker.listenersExecuteDuration(), tracker.lockHoldDuration(), tracker.walCpRecordFsyncDuration(), tracker.writeCheckpointEntryDuration(), tracker.splitAndSortCpPagesDuration(), possibleJvmPauseDur > 0 ? "possibleJvmPauseDuration=" + possibleJvmPauseDur + "ms, " : "", chp.pagesSize, chp.progress.reason()));
            }
            // The checkpoint is abandoned without being marked as finished when page writing reports false.
            if (!writePages(tracker, chp.cpPages, chp.progress, this, this::isShutdownNow))
                return;
        } else {
            if (log.isInfoEnabled())
                LT.info(log, String.format("Skipping checkpoint (no pages were modified) [" + "checkpointBeforeLockTime=%dms, checkpointLockWait=%dms, " + "checkpointListenersExecuteTime=%dms, checkpointLockHoldTime=%dms, reason='%s']", tracker.beforeLockDuration(), tracker.lockWaitDuration(), tracker.listenersExecuteDuration(), tracker.lockHoldDuration(), chp.progress.reason()));
            // No pages are written, but the tracker phases are still advanced.
            tracker.onPagesWriteStart();
            tracker.onFsyncStart();
        }
        snapshotMgr.afterCheckpointPageWritten();
        int destroyedPartitionsCnt = destroyEvictedPartitions();
        // Must mark successful checkpoint only if there are no exceptions or interrupts.
        checkpointWorkflow.markCheckpointEnd(chp);
        tracker.onEnd();
        if (chp.hasDelta() || destroyedPartitionsCnt > 0) {
            if (log.isInfoEnabled()) {
                log.info(String.format("Checkpoint finished [cpId=%s, pages=%d, markPos=%s, " + "walSegmentsCovered=%s, markDuration=%dms, pagesWrite=%dms, fsync=%dms, total=%dms]", chp.cpEntry != null ? chp.cpEntry.checkpointId() : "", chp.pagesSize, chp.cpEntry != null ? chp.cpEntry.checkpointMark() : "", walRangeStr(chp.walSegsCoveredRange), tracker.markDuration(), tracker.pagesWriteDuration(), tracker.fsyncDuration(), tracker.totalDuration()));
            }
        }
        updateMetrics(chp, tracker);
    } catch (IgniteCheckedException e) {
        // chp is still null if the failure happened before markCheckpointBegin() returned; guard the
        // dereference so that an NPE cannot prevent the failure from reaching the failure processor.
        if (chp != null)
            chp.progress.fail(e);
        failureProcessor.process(new FailureContext(FailureType.CRITICAL_ERROR, e));
    }
}
Also used : IgniteCheckedException(org.apache.ignite.IgniteCheckedException) FailureContext(org.apache.ignite.failure.FailureContext) IgniteException(org.apache.ignite.IgniteException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) NodeStoppingException(org.apache.ignite.internal.NodeStoppingException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) IgniteInterruptedCheckedException(org.apache.ignite.internal.IgniteInterruptedCheckedException) CheckpointMetricsTracker(org.apache.ignite.internal.processors.cache.persistence.pagemem.CheckpointMetricsTracker)

Example 39 with FailureContext

use of org.apache.ignite.failure.FailureContext in project ignite by apache.

the class IgniteCacheDatabaseSharedManager method ensureFreeSpaceForInsert.

/**
 * Verifies that the given {@code region} still has room for one more entry.
 *
 * The check is only performed when the data region is not persisted
 * (see {@link DataRegionConfiguration#isPersistenceEnabled()}) and page eviction is disabled
 * (see {@link DataPageEvictionMode#DISABLED}); in every other case the method returns immediately.
 *
 * A non-persistent region has to keep a number of pages in reserve to support the free list
 * {@link AbstractFreeList}. For example, removing a row from the underlying store may require allocating
 * a new data page in order to move a tracked page from one bucket to another one which does not have
 * free space for a new stripe, see {@link AbstractFreeList#removeDataRowByLink}.
 * Inserting a new entry is therefore rejected once the threshold is exceeded.
 *
 * @param region Data region to be checked, a {@code null} region is silently accepted.
 * @param dataRowSize Size of data row to be inserted.
 * @throws IgniteOutOfMemoryException If the given data region does not have enough free space
 * for putting a new entry. The error is also passed to the failure processor when a kernal context exists.
 */
public void ensureFreeSpaceForInsert(DataRegion region, int dataRowSize) throws IgniteOutOfMemoryException {
    if (region == null)
        return;

    DataRegionConfiguration cfg = region.config();

    boolean evictionEnabled = cfg.getPageEvictionMode() != DataPageEvictionMode.DISABLED;

    if (evictionEnabled || cfg.isPersistenceEnabled())
        return;

    long maxSize = cfg.getMaxSize();
    PageMemory mem = region.pageMemory();
    CacheFreeList freeList = freeListMap.get(cfg.getName());

    long occupiedPages = mem.loadedPages() - freeList.emptyDataPages();

    // Upper bound on the number of pages the region can allocate.
    long maxPages = maxSize / mem.systemPageSize();

    // Pages needed for the new entry, plus the pages that are already non-empty, plus the pages that may be
    // needed to move all pages to a reuse bucket (occupiedPages * 8 / pageSize, where 8 is the size of a link).
    // A page cannot be filled with links entirely, see PagesListNodeIO and PagesListMetaIO#getCapacity(),
    // hence the pessimistic 1.5 factor; either way this overhead stays below 1 percent.
    double requiredPages = (double) dataRowSize / mem.pageSize() + occupiedPages * (8.0 * 1.5 / mem.pageSize() + 1) + 256;

    if (!(maxPages < requiredPages))
        return;

    String msg = "Out of memory in data region [" + "name=" + cfg.getName()
        + ", initSize=" + U.readableSize(cfg.getInitialSize(), false)
        + ", maxSize=" + U.readableSize(cfg.getMaxSize(), false)
        + ", persistenceEnabled=" + cfg.isPersistenceEnabled() + "] Try the following:" + U.nl()
        + "  ^-- Increase maximum off-heap memory size (DataRegionConfiguration.maxSize)" + U.nl()
        + "  ^-- Enable Ignite persistence (DataRegionConfiguration.persistenceEnabled)" + U.nl()
        + "  ^-- Enable eviction or expiration policies";

    IgniteOutOfMemoryException outOfMemory = new IgniteOutOfMemoryException(msg);

    if (cctx.kernalContext() != null)
        cctx.kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, outOfMemory));

    throw outOfMemory;
}
Also used : DataRegionConfiguration(org.apache.ignite.configuration.DataRegionConfiguration) IgniteOutOfMemoryException(org.apache.ignite.internal.mem.IgniteOutOfMemoryException) FailureContext(org.apache.ignite.failure.FailureContext) PageMemory(org.apache.ignite.internal.pagemem.PageMemory) CacheFreeList(org.apache.ignite.internal.processors.cache.persistence.freelist.CacheFreeList)

Example 40 with FailureContext

use of org.apache.ignite.failure.FailureContext in project ignite by apache.

the class TcpDiscoverySpi method onExchange.

/**
 * Unmarshals the discovery data carried by the packet and hands it over to the exchange handler.
 * Nothing is done when the local node is a daemon.
 * <p>
 * When the local node is the joining node, a failure to unmarshal the grid data is reported to the
 * failure processor as a critical error (if the Ignite instance is an {@link IgniteEx}) and rethrown
 * as an {@link IgniteException}. Otherwise the joining node data is unmarshalled silently.
 *
 * @param dataPacket object holding discovery data collected during discovery process.
 * @param clsLdr Class loader.
 */
protected void onExchange(DiscoveryDataPacket dataPacket, ClassLoader clsLdr) {
    if (locNode.isDaemon())
        return;

    assert dataPacket != null;
    assert dataPacket.joiningNodeId() != null;

    boolean locNodeJoining = dataPacket.joiningNodeId().equals(locNode.id());

    DiscoveryDataBag bag;

    if (!locNodeJoining) {
        bag = dataPacket.unmarshalJoiningNodeDataSilently(marshaller(), clsLdr, locNode.clientRouterNodeId() != null, log);

        // Zipped joining data may arrive when several nodes, including one without compression support,
        // are trying to join the cluster concurrently.
        if (!allNodesSupport(IgniteFeatures.DATA_PACKET_COMPRESSION) && dataPacket.isJoiningDataZipped())
            dataPacket.unzipData(log);
    } else {
        try {
            bag = dataPacket.unmarshalGridData(marshaller(), clsLdr, locNode.clientRouterNodeId() != null, log);
        } catch (IgniteCheckedException e) {
            // Escalate to the failure processor before propagating the error.
            if (ignite() instanceof IgniteEx)
                ((IgniteEx) ignite()).context().failure().process(new FailureContext(CRITICAL_ERROR, e));

            throw new IgniteException(e);
        }
    }

    exchange.onExchange(bag);
}
Also used : IgniteCheckedException(org.apache.ignite.IgniteCheckedException) DiscoveryDataBag(org.apache.ignite.spi.discovery.DiscoveryDataBag) FailureContext(org.apache.ignite.failure.FailureContext) IgniteException(org.apache.ignite.IgniteException) IgniteEx(org.apache.ignite.internal.IgniteEx) FailureProcessor(org.apache.ignite.internal.processors.failure.FailureProcessor)

Aggregations

FailureContext (org.apache.ignite.failure.FailureContext)54 IgniteCheckedException (org.apache.ignite.IgniteCheckedException)20 IgniteConfiguration (org.apache.ignite.configuration.IgniteConfiguration)13 Ignite (org.apache.ignite.Ignite)11 IOException (java.io.IOException)9 AbstractFailureHandler (org.apache.ignite.failure.AbstractFailureHandler)9 IgniteEx (org.apache.ignite.internal.IgniteEx)9 GridCommonAbstractTest (org.apache.ignite.testframework.junits.common.GridCommonAbstractTest)9 Test (org.junit.Test)9 IgniteException (org.apache.ignite.IgniteException)8 File (java.io.File)6 DataRegionConfiguration (org.apache.ignite.configuration.DataRegionConfiguration)6 StorageException (org.apache.ignite.internal.processors.cache.persistence.StorageException)6 LogListener (org.apache.ignite.testframework.LogListener)5 WithSystemProperty (org.apache.ignite.testframework.junits.WithSystemProperty)4 ByteBuffer (java.nio.ByteBuffer)3 UUID (java.util.UUID)3 DataStorageConfiguration (org.apache.ignite.configuration.DataStorageConfiguration)3 IgniteInternalFuture (org.apache.ignite.internal.IgniteInternalFuture)3 IgniteInterruptedCheckedException (org.apache.ignite.internal.IgniteInterruptedCheckedException)3