use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class GridCacheDatabaseSharedManager method applyLogicalUpdates.
/**
 * Replays the write-ahead log starting at the last checkpoint marker and applies the logical records found
 * there (tx states, partition states, data entries, metastore updates, meta page deltas, encryption and
 * index rename records, partition clearing) to the local node state.
 * <p>
 * Data entries, page deltas and partition clearing are submitted through {@code stripedApply} /
 * {@code stripedApplyPage} on the striped executor. A failure inside such a task is logged together with
 * its cause, and the first failure is additionally kept in {@code applyError}, which is handed to
 * {@code awaitApplyComplete} after the WAL iteration has finished.
 *
 * @param status Last registered checkpoint status.
 * @param cacheGroupsPredicate Cache groups predicate passed to the created {@link RestoreLogicalState}.
 * @param recordTypePredicate Record filter passed to the WAL replay.
 * @param restoreMeta Metastore restore phase if {@code true}.
 * @return Restore state holding the collected partition recovery states.
 * @throws IgniteCheckedException If failed to apply updates.
 * @throws StorageException If IO exception occurred while reading write-ahead log.
 */
private RestoreLogicalState applyLogicalUpdates(
    CheckpointStatus status,
    IgnitePredicate<Integer> cacheGroupsPredicate,
    IgniteBiPredicate<WALRecord.RecordType, WALPointer> recordTypePredicate,
    boolean restoreMeta
) throws IgniteCheckedException {
    if (log.isInfoEnabled())
        log.info("Applying lost " + (restoreMeta ? "metastore" : "cache") + " updates since last checkpoint record [lastMarked=" + status.startPtr + ", lastCheckpointId=" + status.cpStartId + ']');

    // The flag is switched back in the finally block below.
    if (!restoreMeta)
        cctx.kernalContext().query().skipFieldLookup(true);

    long start = U.currentTimeMillis();

    // First error raised by any striped task; later errors are only logged.
    AtomicReference<Throwable> applyError = new AtomicReference<>();

    AtomicLong applied = new AtomicLong();

    long lastArchivedSegment = cctx.wal().lastArchivedSegment();

    StripedExecutor exec = cctx.kernalContext().pools().getStripedExecutorService();

    Semaphore semaphore = new Semaphore(semaphorePertmits(exec));

    Map<GroupPartitionId, Integer> partitionRecoveryStates = new HashMap<>();

    WALIterator it = cctx.wal().replay(status.startPtr, recordTypePredicate);

    RestoreLogicalState restoreLogicalState = new RestoreLogicalState(status, it, lastArchivedSegment, cacheGroupsPredicate, partitionRecoveryStates);

    final IgniteTxManager txManager = cctx.tm();

    try {
        while (restoreLogicalState.hasNext()) {
            WALRecord rec = restoreLogicalState.next();

            if (rec == null)
                break;

            switch (rec.type()) {
                case TX_RECORD:
                    if (restoreMeta) {
                        // Also restore tx states.
                        TxRecord txRec = (TxRecord) rec;

                        txManager.collectTxStates(txRec);
                    }

                    break;

                case CHECKPOINT_RECORD:
                    // Calculate initial partition states.
                    CheckpointRecord cpRec = (CheckpointRecord) rec;

                    for (Map.Entry<Integer, CacheState> entry : cpRec.cacheGroupStates().entrySet()) {
                        CacheState cacheState = entry.getValue();

                        for (int i = 0; i < cacheState.size(); i++) {
                            int partId = cacheState.partitionByIndex(i);
                            byte state = cacheState.stateByIndex(i);

                            // Ignore undefined state.
                            if (state != -1) {
                                partitionRecoveryStates.put(new GroupPartitionId(entry.getKey(), partId), (int) state);
                            }
                        }
                    }

                    break;

                case ROLLBACK_TX_RECORD:
                    RollbackRecord rbRec = (RollbackRecord) rec;

                    CacheGroupContext ctx = cctx.cache().cacheGroup(rbRec.groupId());

                    if (ctx != null && !ctx.isLocal()) {
                        GridDhtLocalPartition part = ctx.topology().forceCreatePartition(rbRec.partitionId());

                        ctx.offheap().dataStore(part).updateInitialCounter(rbRec.start(), rbRec.range());
                    }

                    break;

                case MVCC_DATA_RECORD:
                case DATA_RECORD:
                case DATA_RECORD_V2:
                case ENCRYPTED_DATA_RECORD:
                case ENCRYPTED_DATA_RECORD_V2:
                case ENCRYPTED_DATA_RECORD_V3:
                    DataRecord dataRec = (DataRecord) rec;

                    int entryCnt = dataRec.entryCount();

                    for (int i = 0; i < entryCnt; i++) {
                        DataEntry dataEntry = dataRec.get(i);

                        if (!restoreMeta && txManager.uncommitedTx(dataEntry))
                            continue;

                        int cacheId = dataEntry.cacheId();

                        DynamicCacheDescriptor cacheDesc = cctx.cache().cacheDescriptor(cacheId);

                        // Can empty in case recovery node on blt changed.
                        if (cacheDesc == null)
                            continue;

                        stripedApply(() -> {
                            GridCacheContext cacheCtx = cctx.cacheContext(cacheId);

                            if (skipRemovedIndexUpdates(cacheCtx.groupId(), PageIdAllocator.INDEX_PARTITION))
                                cctx.kernalContext().query().markAsRebuildNeeded(cacheCtx, true);

                            try {
                                applyUpdate(cacheCtx, dataEntry);
                            }
                            catch (IgniteCheckedException e) {
                                // Log the cause too: only the first failure survives in applyError.
                                U.error(log, "Failed to apply data entry, dataEntry=" + dataEntry + ", ptr=" + dataRec.position(), e);

                                applyError.compareAndSet(null, e);
                            }

                            applied.incrementAndGet();
                        }, cacheDesc.groupId(), dataEntry.partitionId(), exec, semaphore);
                    }

                    break;

                case MVCC_TX_RECORD:
                    MvccTxRecord txRecord = (MvccTxRecord) rec;

                    byte txState = convertToTxState(txRecord.state());

                    cctx.coordinators().updateState(txRecord.mvccVersion(), txState, true);

                    break;

                case PART_META_UPDATE_STATE:
                    PartitionMetaStateRecord metaStateRecord = (PartitionMetaStateRecord) rec;

                    GroupPartitionId groupPartitionId = new GroupPartitionId(metaStateRecord.groupId(), metaStateRecord.partitionId());

                    restoreLogicalState.partitionRecoveryStates.put(groupPartitionId, (int) metaStateRecord.state());

                    break;

                case METASTORE_DATA_RECORD:
                    MetastoreDataRecord metastoreDataRecord = (MetastoreDataRecord) rec;

                    metaStorage.applyUpdate(metastoreDataRecord.key(), metastoreDataRecord.value());

                    break;

                case META_PAGE_UPDATE_NEXT_SNAPSHOT_ID:
                case META_PAGE_UPDATE_LAST_SUCCESSFUL_SNAPSHOT_ID:
                case META_PAGE_UPDATE_LAST_SUCCESSFUL_FULL_SNAPSHOT_ID:
                case META_PAGE_UPDATE_LAST_ALLOCATED_INDEX:
                    PageDeltaRecord pageDelta = (PageDeltaRecord) rec;

                    stripedApplyPage((pageMem) -> {
                        try {
                            applyPageDelta(pageMem, pageDelta, false);
                        }
                        catch (IgniteCheckedException e) {
                            // Log the cause too: only the first failure survives in applyError.
                            U.error(log, "Failed to apply page delta, " + pageDelta, e);

                            applyError.compareAndSet(null, e);
                        }
                    }, pageDelta.groupId(), partId(pageDelta.pageId()), exec, semaphore);

                    break;

                case MASTER_KEY_CHANGE_RECORD_V2:
                    cctx.kernalContext().encryption().applyKeys((MasterKeyChangeRecordV2) rec);

                    break;

                case REENCRYPTION_START_RECORD:
                    cctx.kernalContext().encryption().applyReencryptionStartRecord((ReencryptionStartRecord) rec);

                    break;

                case INDEX_ROOT_PAGE_RENAME_RECORD:
                    IndexRenameRootPageRecord record = (IndexRenameRootPageRecord) rec;

                    int cacheId = record.cacheId();

                    GridCacheContext cacheCtx = cctx.cacheContext(cacheId);

                    if (cacheCtx != null) {
                        IgniteCacheOffheapManager offheap = cacheCtx.offheap();

                        // Rename the index root page in every segment listed by the record.
                        for (int i = 0; i < record.segments(); i++)
                            offheap.renameRootPageForIndex(cacheId, record.oldTreeName(), record.newTreeName(), i);
                    }

                    break;

                case PARTITION_CLEARING_START_RECORD:
                    PartitionClearingStartRecord rec0 = (PartitionClearingStartRecord) rec;

                    CacheGroupContext grp = this.ctx.cache().cacheGroup(rec0.groupId());

                    if (grp != null) {
                        GridDhtLocalPartition part;

                        try {
                            part = grp.topology().forceCreatePartition(rec0.partitionId());
                        }
                        catch (IgniteCheckedException e) {
                            throw new IgniteException("Cannot get or create a partition [groupId=" + rec0.groupId() + ", partitionId=" + rec0.partitionId() + "]", e);
                        }

                        stripedApply(() -> {
                            try {
                                part.updateClearVersion(rec0.clearVersion());

                                IgniteInternalFuture<?> clearFut = grp.shared().evict().evictPartitionAsync(grp, part, new GridFutureAdapter<>());

                                // Block this stripe until the clearing has completed.
                                clearFut.get();

                                part.updateClearVersion();
                            }
                            catch (IgniteCheckedException e) {
                                // Log the cause too: only the first failure survives in applyError.
                                U.error(log, "Failed to apply partition clearing record, " + rec0, e);

                                applyError.compareAndSet(null, e);
                            }
                        }, rec0.groupId(), rec0.partitionId(), exec, semaphore);
                    }

                    break;

                default:
                    // Other record types are not handled here.
            }
        }
    }
    finally {
        it.close();

        if (!restoreMeta)
            cctx.kernalContext().query().skipFieldLookup(false);
    }

    // NOTE(review): presumably waits for the submitted striped tasks and rethrows applyError - confirm.
    awaitApplyComplete(exec, applyError);

    if (log.isInfoEnabled())
        log.info("Finished applying WAL changes [updatesApplied=" + applied + ", time=" + (U.currentTimeMillis() - start) + " ms]");

    for (DatabaseLifecycleListener lsnr : getDatabaseListeners(cctx.kernalContext()))
        lsnr.afterLogicalUpdatesApplied(this, restoreLogicalState);

    return restoreLogicalState;
}
use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class GridCacheOffheapManager method saveStoreMetadata.
/**
* Persists the metadata of a partition data store: free list metadata and, under a write lock on the
* partition meta page, the update counter, global remove id, size, update counter gaps link, encryption
* progress, partition state and (for shared groups) the counters page id. When something changed and a WAL
* delta record is needed, a {@code MetaPageUpdatePartitionDataRecordV3} is logged.
* <p>
* Nothing is written for a store without a row store, or while partition states are not restored yet for a
* non-local group; in these cases only the snapshot bookkeeping for an empty partition is attempted when
* {@code needSnapshot} is set.
*
* @param store Store to save metadata.
* @param ctx Context whose {@code partitionStatMap()} is updated when {@code needSnapshot} is {@code true}.
* @param beforeDestroy If {@code true}, the partition of a non-local group is recorded in {@code EVICTED}
*      state without looking the partition up.
* @param needSnapshot If {@code true}, the candidate page count is refreshed and the partition is registered
*      in the partition stat map of {@code ctx}.
* @throws IgniteCheckedException If failed.
*/
private void saveStoreMetadata(CacheDataStore store, Context ctx, boolean beforeDestroy, boolean needSnapshot) throws IgniteCheckedException {
RowStore rowStore0 = store.rowStore();
if (rowStore0 != null && (partitionStatesRestored || grp.isLocal())) {
((CacheFreeList) rowStore0.freeList()).saveMetadata(grp.statisticsHolderData());
PartitionMetaStorage<SimpleDataRow> partStore = store.partStorage();
// Values to persist are read once, before the meta page is locked.
long updCntr = store.updateCounter();
long size = store.fullSize();
long rmvId = globalRemoveId().get();
byte[] updCntrsBytes = store.partUpdateCounter().getBytes();
PageMemoryEx pageMem = (PageMemoryEx) grp.dataRegion().pageMemory();
IgniteWriteAheadLogManager wal = this.ctx.wal();
GridEncryptionManager encMgr = this.ctx.kernalContext().encryption();
// The meta page is touched only if the partition has data, a non-zero update counter, a non-sequential
// update counter, or a non-zero encryption state in an encrypted group.
if (size > 0 || updCntr > 0 || !store.partUpdateCounter().sequential() || (grp.config().isEncryptionEnabled() && encMgr.getEncryptionState(grp.groupId(), store.partId()) > 0)) {
GridDhtPartitionState state = null;
// localPartition will not acquire writeLock here because create=false.
GridDhtLocalPartition part = null;
if (!grp.isLocal()) {
if (beforeDestroy)
state = GridDhtPartitionState.EVICTED;
else {
part = getPartition(store);
if (part != null && part.state() != GridDhtPartitionState.EVICTED)
state = part.state();
}
// Do not save meta for evicted partitions on next checkpoints.
if (state == null)
return;
}
int grpId = grp.groupId();
long partMetaId = pageMem.partitionMetaPageId(grpId, store.partId());
long partMetaPage = pageMem.acquirePage(grpId, partMetaId);
try {
long partMetaPageAddr = pageMem.writeLock(grpId, partMetaId, partMetaPage);
// A zero address means the write lock was not obtained; the page is still released by the outer finally.
if (partMetaPageAddr == 0L) {
U.warn(log, "Failed to acquire write lock for meta page [metaPage=" + partMetaPage + ", beforeDestroy=" + beforeDestroy + ", size=" + size + ", updCntr=" + updCntr + ", state=" + state + ']');
return;
}
// Set whenever the meta page content is modified; passed to writeUnlock in the finally block.
boolean changed = false;
try {
PagePartitionMetaIOV3 io = PageIO.getPageIO(partMetaPageAddr);
long link = io.getGapsLink(partMetaPageAddr);
// Synchronize the stored update counter gaps row with the current counter bytes:
// remove a stale row, insert a missing one, or replace a row whose bytes differ.
if (updCntrsBytes == null && link != 0) {
partStore.removeDataRowByLink(link, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = 0));
changed = true;
} else if (updCntrsBytes != null && link == 0) {
SimpleDataRow row = new SimpleDataRow(store.partId(), updCntrsBytes);
partStore.insertDataRow(row, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = row.link()));
changed = true;
} else if (updCntrsBytes != null && link != 0) {
byte[] prev = partStore.readRow(link);
assert prev != null : "Read null gaps using link=" + link;
if (!Arrays.equals(prev, updCntrsBytes)) {
partStore.removeDataRowByLink(link, grp.statisticsHolderData());
SimpleDataRow row = new SimpleDataRow(store.partId(), updCntrsBytes);
partStore.insertDataRow(row, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = row.link()));
changed = true;
}
}
if (changed)
partStore.saveMetadata(grp.statisticsHolderData());
// The boolean results of the setters are accumulated into the changed flag.
changed |= io.setUpdateCounter(partMetaPageAddr, updCntr);
changed |= io.setGlobalRemoveId(partMetaPageAddr, rmvId);
changed |= io.setSize(partMetaPageAddr, size);
int encryptIdx = 0;
int encryptCnt = 0;
if (grp.config().isEncryptionEnabled()) {
long reencryptState = encMgr.getEncryptionState(grpId, store.partId());
if (reencryptState != 0) {
encryptIdx = ReencryptStateUtils.pageIndex(reencryptState);
encryptCnt = ReencryptStateUtils.pageCount(reencryptState);
// Page index equal to page count: the encryption state is reset to zero.
if (encryptIdx == encryptCnt) {
encMgr.setEncryptionState(grp, store.partId(), 0, 0);
encryptIdx = encryptCnt = 0;
}
changed |= io.setEncryptedPageIndex(partMetaPageAddr, encryptIdx);
changed |= io.setEncryptedPageCount(partMetaPageAddr, encryptCnt);
}
}
// State is null only for local groups (see the early return above for non-local ones).
if (state != null)
changed |= io.setPartitionState(partMetaPageAddr, (byte) state.ordinal());
else
assert grp.isLocal() : grp.cacheOrGroupName();
long cntrsPageId;
if (grp.sharedGroup()) {
long initCntrPageId = io.getCountersPageId(partMetaPageAddr);
Map<Integer, Long> newSizes = store.cacheSizes();
Map<Integer, Long> prevSizes = readSharedGroupCacheSizes(pageMem, grpId, initCntrPageId);
if (prevSizes != null && prevSizes.equals(newSizes))
// Preventing modification of sizes pages for store
cntrsPageId = initCntrPageId;
else {
cntrsPageId = writeSharedGroupCacheSizes(pageMem, grpId, initCntrPageId, store.partId(), newSizes);
// The counters page id is stored only when there was none before and one has been produced now.
if (initCntrPageId == 0 && cntrsPageId != 0) {
io.setCountersPageId(partMetaPageAddr, cntrsPageId);
changed = true;
}
}
} else
cntrsPageId = 0L;
int pageCnt;
if (needSnapshot) {
pageCnt = this.ctx.pageStore().pages(grpId, store.partId());
// An empty partition gets a candidate page count of zero.
io.setCandidatePageCount(partMetaPageAddr, size == 0 ? 0 : pageCnt);
if (state == OWNING) {
assert part != null;
if (!addPartition(part, ctx.partitionStatMap(), partMetaPageAddr, io, grpId, store.partId(), this.ctx.pageStore().pages(grpId, store.partId()), store.fullSize()))
U.warn(log, "Partition was concurrently evicted grpId=" + grpId + ", partitionId=" + part.id());
} else if (state == MOVING || state == RENTING) {
if (ctx.partitionStatMap().forceSkipIndexPartition(grpId)) {
if (log.isInfoEnabled())
log.info("Will not include SQL indexes to snapshot because there is " + "a partition not in " + OWNING + " state [grp=" + grp.cacheOrGroupName() + ", partId=" + store.partId() + ", state=" + state + ']');
}
}
// With needSnapshot the page is always treated as changed.
changed = true;
} else
pageCnt = io.getCandidatePageCount(partMetaPageAddr);
// Log the delta record mirroring the values written to the page above.
if (changed && isWalDeltaRecordNeeded(pageMem, grpId, partMetaId, partMetaPage, wal, null))
wal.log(new MetaPageUpdatePartitionDataRecordV3(grpId, partMetaId, updCntr, rmvId, // TODO: Partition size may be long
(int) size, cntrsPageId, state == null ? -1 : (byte) state.ordinal(), pageCnt, link, encryptIdx, encryptCnt));
if (changed) {
partStore.saveMetadata(grp.statisticsHolderData());
io.setPartitionMetaStoreReuseListRoot(partMetaPageAddr, partStore.metaPageId());
}
} finally {
// NOTE(review): the last argument looks like the dirty flag for the page - confirm against PageMemoryEx.
pageMem.writeUnlock(grpId, partMetaId, partMetaPage, null, changed);
}
} finally {
pageMem.releasePage(grpId, partMetaId, partMetaPage);
}
} else if (needSnapshot)
tryAddEmptyPartitionToSnapshot(store, ctx);
} else if (needSnapshot)
tryAddEmptyPartitionToSnapshot(store, ctx);
}
use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class GridCacheOffheapManager method preloadPartition.
/**
 * {@inheritDoc}
 */
@Override
public void preloadPartition(int partId) throws IgniteCheckedException {
    if (!grp.isLocal()) {
        // Look the partition up without creating it; the caller is expected to hold a reservation.
        GridDhtLocalPartition reserved = grp.topology().localPartition(partId, AffinityTopologyVersion.NONE, false, false);

        assert reserved != null && reserved.reservations() > 0;

        reserved.dataStore().preload();
    }
    else {
        // Local group: preload the store obtained via dataStore(null).
        dataStore(null).preload();
    }
}
use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class GridCacheOffheapManager method restoreStateOfPartition.
/**
 * {@inheritDoc}
 */
@Override
public long restoreStateOfPartition(int p, @Nullable Integer recoveryState) throws IgniteCheckedException {
    // Restore applies only to non-local groups on affinity nodes with persistence enabled,
    // and only while partition states have not been restored yet.
    boolean eligible = !grp.isLocal()
        && grp.affinityNode()
        && grp.dataRegion().config().isPersistenceEnabled()
        && !partitionStatesRestored;

    if (!eligible)
        return 0;

    PageMemoryEx mem = (PageMemoryEx) grp.dataRegion().pageMemory();

    long begin = U.currentTimeMillis();

    long elapsed = 0;

    if (log.isDebugEnabled())
        log.debug("Started restoring partition state [grp=" + grp.cacheOrGroupName() + ", p=" + p + ']');

    if (!ctx.pageStore().exists(grp.groupId(), p)) {
        if (recoveryState == null) {
            if (log.isDebugEnabled())
                log.debug("Skipping partition on recovery (no page store OR wal state) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ']');
        }
        else {
            // Pre-create partition if having valid state.
            GridDhtLocalPartition created = grp.topology().forceCreatePartition(p);

            updateState(created, recoveryState);

            elapsed = U.currentTimeMillis() - begin;

            if (log.isDebugEnabled())
                log.debug("Restored partition state (from WAL) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", state=" + created.state() + ", updCntr=" + created.initialUpdateCounter() + ", size=" + created.fullSize() + ']');
        }
    }
    else {
        ctx.pageStore().ensure(grp.groupId(), p);

        // A partition file holding at most one page is skipped.
        if (ctx.pageStore().pages(grp.groupId(), p) <= 1) {
            if (log.isDebugEnabled())
                log.debug("Skipping partition on recovery (pages less than or equals 1) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ']');

            return 0;
        }

        if (log.isDebugEnabled())
            log.debug("Creating partition on recovery (exists in page store) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ']');

        GridDhtLocalPartition locPart = grp.topology().forceCreatePartition(p);

        // Triggers initialization of existing(having datafile) partition before acquiring cp read lock.
        locPart.dataStore().init();

        ctx.database().checkpointReadLock();

        try {
            long metaPageId = mem.partitionMetaPageId(grp.groupId(), p);
            long metaPage = mem.acquirePage(grp.groupId(), metaPageId);

            try {
                long addr = mem.writeLock(grp.groupId(), metaPageId, metaPage);

                boolean dirty = false;

                try {
                    PagePartitionMetaIO metaIo = PagePartitionMetaIO.VERSIONS.forPage(addr);

                    if (recoveryState == null) {
                        // No state came from the WAL: take the one stored in the meta page.
                        int stateId = metaIo.getPartitionState(addr);

                        updateState(locPart, stateId);

                        if (log.isDebugEnabled())
                            log.debug("Restored partition state (from page memory) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", state=" + locPart.state() + ", updCntr=" + locPart.initialUpdateCounter() + ", stateId=" + stateId + ", size=" + locPart.fullSize() + ']');
                    }
                    else {
                        // State recovered from the WAL overrides the meta page content.
                        dirty = metaIo.setPartitionState(addr, (byte) recoveryState.intValue());

                        updateState(locPart, recoveryState);

                        if (log.isDebugEnabled())
                            log.debug("Restored partition state (from WAL) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", state=" + locPart.state() + ", updCntr=" + locPart.initialUpdateCounter() + ", size=" + locPart.fullSize() + ']');
                    }
                }
                finally {
                    mem.writeUnlock(grp.groupId(), metaPageId, metaPage, null, dirty);
                }
            }
            finally {
                mem.releasePage(grp.groupId(), metaPageId, metaPage);
            }
        }
        finally {
            ctx.database().checkpointReadUnlock();
        }

        elapsed = U.currentTimeMillis() - begin;
    }

    if (log.isDebugEnabled())
        log.debug("Finished restoring partition state " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", time=" + U.humanReadableDuration(U.currentTimeMillis() - begin) + ']');

    return elapsed;
}
use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class IgniteTxLocalAdapter method calculatePartitionUpdateCounters.
/**
 * Builds the per-cache partition update counter messages for this transaction. For every partition with a
 * non-zero accumulated count a pair (init, delta) is recorded: init is the value returned by reserving the
 * range on the local partition, delta is the number of updates made by this transaction to the partition.
 * <p>
 * Does nothing when the transaction has no counters or the update counters are already populated.
 *
 * @throws IgniteTxRollbackCheckedException If the local node is no longer primary for a touched partition.
 */
public void calculatePartitionUpdateCounters() throws IgniteTxRollbackCheckedException {
    TxCounters txCntrs = txCounters(false);

    if (txCntrs == null || !F.isEmpty(txCntrs.updateCounters()))
        return;

    List<PartitionUpdateCountersMessage> result = new ArrayList<>();

    for (Map.Entry<Integer, Map<Integer, AtomicLong>> perCache : txCntrs.accumulatedUpdateCounters().entrySet()) {
        int cacheId = perCache.getKey();

        Map<Integer, AtomicLong> perPart = perCache.getValue();

        assert perPart != null;

        if (F.isEmpty(perPart))
            continue;

        PartitionUpdateCountersMessage cacheMsg = new PartitionUpdateCountersMessage(cacheId, perPart.size());

        GridCacheContext cacheCtx = cctx.cacheContext(cacheId);

        GridDhtPartitionTopology topology = cacheCtx.topology();

        assert topology != null;

        for (Map.Entry<Integer, AtomicLong> partEntry : perPart.entrySet()) {
            AtomicLong accumulated = partEntry.getValue();

            assert accumulated != null;

            long delta = accumulated.get();

            assert delta >= 0;

            // Partitions without updates contribute nothing to the message.
            if (delta == 0)
                continue;

            int partId = partEntry.getKey();

            GridDhtLocalPartition locPart = topology.localPartition(partId);

            // Verify primary tx mapping.
            // LOST state is possible if tx is started over LOST partition.
            boolean valid = locPart != null
                && (locPart.state() == OWNING || locPart.state() == LOST)
                && locPart.primary(topology.readyTopologyVersion());

            if (!valid) {
                // Local node is no longer primary for the partition, need to rollback a transaction.
                if (locPart != null && !locPart.primary(topology.readyTopologyVersion())) {
                    log.warning("Failed to prepare a transaction on outdated topology, rolling back " + "[tx=" + CU.txString(this) + ", readyTopVer=" + topology.readyTopologyVersion() + ", lostParts=" + topology.lostPartitions() + ", part=" + locPart.toString() + ']');

                    throw new IgniteTxRollbackCheckedException("Failed to prepare a transaction on outdated " + "topology, please try again [timeout=" + timeout() + ", tx=" + CU.txString(this) + ']');
                }

                // Trigger error.
                throw new AssertionError("Invalid primary mapping [tx=" + CU.txString(this) + ", readyTopVer=" + topology.readyTopologyVersion() + ", lostParts=" + topology.lostPartitions() + ", part=" + (locPart == null ? "NULL" : locPart.toString()) + ']');
            }

            cacheMsg.add(partId, locPart.getAndIncrementUpdateCounter(delta), delta);
        }

        if (cacheMsg.size() > 0)
            result.add(cacheMsg);
    }

    txCntrs.updateCounters(result);
}
Aggregations