Use of org.apache.ignite.failure.FailureContext in project ignite by apache.
The class GridCacheDatabaseSharedManager, method restoreBinaryMemory.
/**
* @param cacheGroupsPredicate Cache groups to restore.
* @param recordTypePredicate Filter records by type.
* @return Last seen WAL pointer during binary memory recovery.
* @throws IgniteCheckedException If failed.
*/
private RestoreBinaryState restoreBinaryMemory(
    IgnitePredicate<Integer> cacheGroupsPredicate,
    IgniteBiPredicate<WALRecord.RecordType, WALPointer> recordTypePredicate
) throws IgniteCheckedException {
    long time = System.currentTimeMillis();

    try {
        if (log.isInfoEnabled())
            log.info("Starting binary memory restore for: " + cctx.cache().cacheGroupDescriptors().keySet());

        for (DatabaseLifecycleListener lsnr : getDatabaseListeners(cctx.kernalContext()))
            lsnr.beforeBinaryMemoryRestore(this);

        CheckpointStatus status = readCheckpointStatus();

        // First, bring memory to the last consistent checkpoint state if needed.
        // This method should return a pointer to the last valid record in the WAL.
        RestoreBinaryState binaryState = performBinaryMemoryRestore(status, cacheGroupsPredicate, recordTypePredicate, true);

        WALPointer restored = binaryState.lastReadRecordPointer();

        if (restored.equals(CheckpointStatus.NULL_PTR))
            // This record is first.
            restored = null;
        else
            restored = restored.next();

        if (restored == null && !status.endPtr.equals(CheckpointStatus.NULL_PTR)) {
            throw new StorageException("The memory cannot be restored. The critical part of WAL archive is missing " +
                "[tailWalPtr=" + restored + ", endPtr=" + status.endPtr + ']');
        }
        else if (restored != null)
            U.log(log, "Binary memory state restored at node startup [restoredPtr=" + restored + ']');

        // WAL logging is now available.
        cctx.wal().resumeLogging(restored);

        // Log MemoryRecoveryRecord to make sure that old physical records are not replayed during
        // the next physical recovery.
        checkpointManager.memoryRecoveryRecordPtr(cctx.wal().log(new MemoryRecoveryRecord(U.currentTimeMillis())));

        for (DatabaseLifecycleListener lsnr : getDatabaseListeners(cctx.kernalContext()))
            lsnr.afterBinaryMemoryRestore(this, binaryState);

        if (log.isInfoEnabled())
            log.info("Binary recovery performed in " + (System.currentTimeMillis() - time) + " ms.");

        return binaryState;
    }
    catch (IgniteCheckedException e) {
        if (X.hasCause(e, StorageException.class, IOException.class))
            cctx.kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, e));

        throw e;
    }
}
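The snippet above reports unrecoverable storage errors by handing a FailureContext with FailureType.CRITICAL_ERROR to the node's failure processor. On the public API side, the reaction to such reports is pluggable via org.apache.ignite.failure.FailureHandler. Below is a minimal sketch (not taken from the Ignite sources; the class name LoggingFailureHandler is illustrative) of a custom handler that logs the reported context and asks Ignite to invalidate the node.

import org.apache.ignite.Ignite;
import org.apache.ignite.failure.FailureContext;
import org.apache.ignite.failure.FailureHandler;

/** Illustrative handler: logs every reported FailureContext and requests node invalidation. */
public class LoggingFailureHandler implements FailureHandler {
    @Override public boolean onFailure(Ignite ignite, FailureContext failureCtx) {
        ignite.log().error("Critical failure reported [type=" + failureCtx.type() + ']', failureCtx.error());

        return true; // Returning true tells Ignite to treat the node as invalidated.
    }
}

// Usage sketch: register the handler before starting the node, e.g.
// new IgniteConfiguration().setFailureHandler(new LoggingFailureHandler());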
Use of org.apache.ignite.failure.FailureContext in project ignite by apache.
The class Checkpointer, method body.
/**
* {@inheritDoc}
*/
@Override
protected void body() {
    Throwable err = null;

    try {
        while (!isCancelled()) {
            waitCheckpointEvent();

            if (skipCheckpointOnNodeStop && (isCancelled() || shutdownNow)) {
                if (log.isInfoEnabled())
                    log.warning("Skipping last checkpoint because node is stopping.");

                return;
            }

            GridFutureAdapter<Void> enableChangeApplied = this.enableChangeApplied;

            if (enableChangeApplied != null) {
                enableChangeApplied.onDone();

                this.enableChangeApplied = null;
            }

            if (checkpointsEnabled)
                doCheckpoint();
            else {
                synchronized (this) {
                    scheduledCp.nextCpNanos(System.nanoTime() + U.millisToNanos(nextCheckpointInterval()));
                }
            }
        }

        // Final run after the cancellation.
        if (checkpointsEnabled && !shutdownNow)
            doCheckpoint();
    }
    catch (Throwable t) {
        err = t;

        scheduledCp.fail(t);

        throw t;
    }
    finally {
        if (err == null && !(isCancelled))
            err = new IllegalStateException("Thread is terminated unexpectedly: " + name());

        if (err instanceof OutOfMemoryError)
            failureProcessor.process(new FailureContext(CRITICAL_ERROR, err));
        else if (err != null)
            failureProcessor.process(new FailureContext(SYSTEM_WORKER_TERMINATION, err));

        scheduledCp.fail(new NodeStoppingException("Node is stopping."));
    }
}
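The finally block above classifies the worker's terminal error before reporting it: an OutOfMemoryError is reported as FailureType.CRITICAL_ERROR, while any other unexpected termination is reported as FailureType.SYSTEM_WORKER_TERMINATION. A hypothetical standalone helper expressing the same mapping (WorkerErrorClassifier is not an Ignite class, just a sketch of the logic) could look like this:

import org.apache.ignite.failure.FailureContext;
import org.apache.ignite.failure.FailureType;

/** Hypothetical helper mirroring the classification done in the finally block above. */
final class WorkerErrorClassifier {
    static FailureContext classify(Throwable err) {
        FailureType type = err instanceof OutOfMemoryError
            ? FailureType.CRITICAL_ERROR              // OOM invalidates the whole node.
            : FailureType.SYSTEM_WORKER_TERMINATION;  // Any other unexpected worker death.

        return new FailureContext(type, err);
    }
}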
Use of org.apache.ignite.failure.FailureContext in project ignite by apache.
The class Checkpointer, method doCheckpoint.
/**
*/
private void doCheckpoint() {
    Checkpoint chp = null;

    try {
        CheckpointMetricsTracker tracker = new CheckpointMetricsTracker();

        startCheckpointProgress();

        try {
            chp = checkpointWorkflow.markCheckpointBegin(lastCpTs, curCpProgress, tracker, this);
        }
        catch (Exception e) {
            if (curCpProgress != null)
                curCpProgress.fail(e);

            // In case of a checkpoint initialization error the node should be invalidated and stopped.
            failureProcessor.process(new FailureContext(FailureType.CRITICAL_ERROR, e));

            // Re-throw as an unchecked exception to force stopping the checkpoint thread.
            throw new IgniteException(e);
        }

        updateHeartbeat();

        currentProgress().initCounters(chp.pagesSize);

        if (chp.hasDelta()) {
            if (log.isInfoEnabled()) {
                long possibleJvmPauseDur = possibleLongJvmPauseDuration(tracker);

                if (log.isInfoEnabled())
                    log.info(String.format(CHECKPOINT_STARTED_LOG_FORMAT,
                        chp.cpEntry == null ? "" : chp.cpEntry.checkpointId(),
                        chp.cpEntry == null ? "" : chp.cpEntry.checkpointMark(),
                        tracker.beforeLockDuration(), tracker.lockWaitDuration(),
                        tracker.listenersExecuteDuration(), tracker.lockHoldDuration(),
                        tracker.walCpRecordFsyncDuration(), tracker.writeCheckpointEntryDuration(),
                        tracker.splitAndSortCpPagesDuration(),
                        possibleJvmPauseDur > 0 ? "possibleJvmPauseDuration=" + possibleJvmPauseDur + "ms, " : "",
                        chp.pagesSize, chp.progress.reason()));
            }

            if (!writePages(tracker, chp.cpPages, chp.progress, this, this::isShutdownNow))
                return;
        }
        else {
            if (log.isInfoEnabled())
                LT.info(log, String.format("Skipping checkpoint (no pages were modified) [" +
                        "checkpointBeforeLockTime=%dms, checkpointLockWait=%dms, " +
                        "checkpointListenersExecuteTime=%dms, checkpointLockHoldTime=%dms, reason='%s']",
                    tracker.beforeLockDuration(), tracker.lockWaitDuration(),
                    tracker.listenersExecuteDuration(), tracker.lockHoldDuration(), chp.progress.reason()));

            tracker.onPagesWriteStart();
            tracker.onFsyncStart();
        }

        snapshotMgr.afterCheckpointPageWritten();

        int destroyedPartitionsCnt = destroyEvictedPartitions();

        // Must mark a successful checkpoint only if there are no exceptions or interrupts.
        checkpointWorkflow.markCheckpointEnd(chp);

        tracker.onEnd();

        if (chp.hasDelta() || destroyedPartitionsCnt > 0) {
            if (log.isInfoEnabled()) {
                log.info(String.format("Checkpoint finished [cpId=%s, pages=%d, markPos=%s, " +
                        "walSegmentsCovered=%s, markDuration=%dms, pagesWrite=%dms, fsync=%dms, total=%dms]",
                    chp.cpEntry != null ? chp.cpEntry.checkpointId() : "", chp.pagesSize,
                    chp.cpEntry != null ? chp.cpEntry.checkpointMark() : "",
                    walRangeStr(chp.walSegsCoveredRange), tracker.markDuration(),
                    tracker.pagesWriteDuration(), tracker.fsyncDuration(), tracker.totalDuration()));
            }
        }

        updateMetrics(chp, tracker);
    }
    catch (IgniteCheckedException e) {
        chp.progress.fail(e);

        failureProcessor.process(new FailureContext(FailureType.CRITICAL_ERROR, e));
    }
}
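When checkpoint initialization or completion fails, the snippet reports FailureType.CRITICAL_ERROR, which by default leads to the node being stopped (or the JVM halted if a graceful stop is not possible). A minimal configuration sketch that makes this reaction explicit; the class name and the constructor arguments below are illustrative assumptions, not values from the Ignite sources:

import org.apache.ignite.Ignition;
import org.apache.ignite.configuration.IgniteConfiguration;
import org.apache.ignite.failure.StopNodeOrHaltFailureHandler;

public class CheckpointFailureHandlingSetup {
    public static void main(String[] args) {
        // Try a graceful node stop first; halt the JVM if it does not finish within 60 seconds.
        IgniteConfiguration cfg = new IgniteConfiguration()
            .setFailureHandler(new StopNodeOrHaltFailureHandler(true, 60_000L));

        Ignition.start(cfg);
    }
}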
Use of org.apache.ignite.failure.FailureContext in project ignite by apache.
The class IgniteCacheDatabaseSharedManager, method ensureFreeSpaceForInsert.
/**
* Checks that the given {@code region} has enough space for putting a new entry.
*
* This check only makes sense when the data region is not persistent
* ({@link DataRegionConfiguration#isPersistenceEnabled()})
* and page eviction is disabled ({@link DataPageEvictionMode#DISABLED}).
*
* A non-persistent region should reserve a number of pages to support the free list ({@link AbstractFreeList}).
* For example, removing a row from the underlying store may require allocating a new data page
* in order to move a tracked page from one bucket to another one that does not have free space for a new stripe.
* See {@link AbstractFreeList#removeDataRowByLink}.
* Therefore, inserting a new entry should be prevented once a certain threshold is exceeded.
*
* @param region Data region to be checked.
* @param dataRowSize Size of the data row to be inserted.
* @throws IgniteOutOfMemoryException If the given data region does not have enough free space
* for putting a new entry.
*/
public void ensureFreeSpaceForInsert(DataRegion region, int dataRowSize) throws IgniteOutOfMemoryException {
    if (region == null)
        return;

    DataRegionConfiguration regCfg = region.config();

    if (regCfg.getPageEvictionMode() != DataPageEvictionMode.DISABLED || regCfg.isPersistenceEnabled())
        return;

    long memorySize = regCfg.getMaxSize();

    PageMemory pageMem = region.pageMemory();

    CacheFreeList freeList = freeListMap.get(regCfg.getName());

    long nonEmptyPages = (pageMem.loadedPages() - freeList.emptyDataPages());

    // The maximum number of pages that can be allocated (memorySize / systemPageSize)
    // should be greater than or equal to the pages required for inserting a new entry plus
    // the current number of non-empty pages plus the number of pages that may be required in order to move
    // all pages to a reuse bucket, that is equal to nonEmptyPages * 8 / pageSize, where 8 is the size of a link.
    // Note that not the whole page can be used for storing links,
    // see PagesListNodeIO and PagesListMetaIO#getCapacity(), so we pessimistically multiply the result by 1.5;
    // either way, the number of required pages is less than 1 percent.
    boolean oomThreshold = (memorySize / pageMem.systemPageSize()) <
        ((double)dataRowSize / pageMem.pageSize() + nonEmptyPages * (8.0 * 1.5 / pageMem.pageSize() + 1) + 256);

    if (oomThreshold) {
        IgniteOutOfMemoryException oom = new IgniteOutOfMemoryException("Out of memory in data region [" +
            "name=" + regCfg.getName() +
            ", initSize=" + U.readableSize(regCfg.getInitialSize(), false) +
            ", maxSize=" + U.readableSize(regCfg.getMaxSize(), false) +
            ", persistenceEnabled=" + regCfg.isPersistenceEnabled() + "] Try the following:" + U.nl() +
            " ^-- Increase maximum off-heap memory size (DataRegionConfiguration.maxSize)" + U.nl() +
            " ^-- Enable Ignite persistence (DataRegionConfiguration.persistenceEnabled)" + U.nl() +
            " ^-- Enable eviction or expiration policies");

        if (cctx.kernalContext() != null)
            cctx.kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, oom));

        throw oom;
    }
}
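The boolean expression above compares the number of pages the region can ever allocate (maxSize / systemPageSize) with a pessimistic estimate of the pages needed: pages for the new row, the non-empty pages already in use, pages required to move their rows to a reuse bucket (8-byte links, padded by 1.5 for page-list overhead), and a fixed margin of 256 pages. A hypothetical standalone rendering of the same arithmetic, with purely illustrative numbers:

public class FreeSpaceThresholdSketch {
    public static void main(String[] args) {
        // Illustrative inputs: 100 MiB region, 4 KiB pages, a slightly larger system page size,
        // 10_000 non-empty pages already loaded, and a 512-byte row to insert.
        long maxSize = 100L * 1024 * 1024;
        int pageSize = 4096;
        int sysPageSize = 4096 + 256;
        long nonEmptyPages = 10_000;
        int dataRowSize = 512;

        long availablePages = maxSize / sysPageSize;

        // Same estimate as in ensureFreeSpaceForInsert: new-row pages + non-empty pages
        // + link pages to move them to a reuse bucket (x1.5) + a fixed margin of 256.
        double requiredPages = (double)dataRowSize / pageSize
            + nonEmptyPages * (8.0 * 1.5 / pageSize + 1)
            + 256;

        boolean oomThreshold = availablePages < requiredPages;

        System.out.println("available=" + availablePages + ", required=" + (long)requiredPages + ", oom=" + oomThreshold);
    }
}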
Use of org.apache.ignite.failure.FailureContext in project ignite by apache.
The class TcpDiscoverySpi, method onExchange.
/**
* @param dataPacket Object holding discovery data collected during the discovery process.
* @param clsLdr Class loader.
*/
protected void onExchange(DiscoveryDataPacket dataPacket, ClassLoader clsLdr) {
    if (locNode.isDaemon())
        return;

    assert dataPacket != null;
    assert dataPacket.joiningNodeId() != null;

    DiscoveryDataBag dataBag;

    if (dataPacket.joiningNodeId().equals(locNode.id())) {
        try {
            dataBag = dataPacket.unmarshalGridData(marshaller(), clsLdr, locNode.clientRouterNodeId() != null, log);
        }
        catch (IgniteCheckedException e) {
            if (ignite() instanceof IgniteEx) {
                FailureProcessor failure = ((IgniteEx)ignite()).context().failure();

                failure.process(new FailureContext(CRITICAL_ERROR, e));
            }

            throw new IgniteException(e);
        }
    }
    else {
        dataBag = dataPacket.unmarshalJoiningNodeDataSilently(marshaller(), clsLdr, locNode.clientRouterNodeId() != null, log);

        // This can happen when several nodes, including one without compression support, try to join the cluster concurrently.
        if (!allNodesSupport(IgniteFeatures.DATA_PACKET_COMPRESSION) && dataPacket.isJoiningDataZipped())
            dataPacket.unzipData(log);
    }

    exchange.onExchange(dataBag);
}
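On the joining node, the unmarshalling error is reported through the failure processor and then rethrown, so it ultimately surfaces from Ignition.start() as an IgniteException. A minimal sketch of observing both sides of that behaviour, assuming a local test setup; the instance name, the class name JoinFailureSketch, and the NoOpFailureHandler choice are illustrative:

import org.apache.ignite.IgniteException;
import org.apache.ignite.Ignition;
import org.apache.ignite.configuration.IgniteConfiguration;
import org.apache.ignite.failure.NoOpFailureHandler;

public class JoinFailureSketch {
    public static void main(String[] args) {
        IgniteConfiguration cfg = new IgniteConfiguration()
            .setIgniteInstanceName("joining-node")
            .setFailureHandler(new NoOpFailureHandler()); // Keep the JVM alive for this sketch.

        try {
            Ignition.start(cfg);
        }
        catch (IgniteException e) {
            // If discovery data exchange failed, the same cause was already handed to the
            // failure handler wrapped in a FailureContext(CRITICAL_ERROR, e).
            System.err.println("Join failed: " + e.getMessage());
        }
    }
}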