use of org.elasticsearch.index.seqno.RetentionLeases in project crate by crate.
the class IndexShard method syncRetentionLeases.
* Syncs the current retention leases to all replicas.
public void syncRetentionLeases() {
assert assertPrimaryMode();
final Tuple<Boolean, RetentionLeases> retentionLeases = getRetentionLeases(true);
if (retentionLeases.v1()) {
logger.trace("syncing retention leases [{}] after expiration check", retentionLeases.v2());
retentionLeaseSyncer.sync(shardId, shardRouting.allocationId().getId(), getPendingPrimaryTerm(), retentionLeases.v2(), ActionListener.wrap(r -> {
}, e -> logger.warn(new ParameterizedMessage("failed to sync retention leases [{}] after expiration check", retentionLeases), e)));
} else {
logger.trace("background syncing retention leases [{}] after expiration check", retentionLeases.v2());
retentionLeaseSyncer.backgroundSync(shardId, shardRouting.allocationId().getId(), getPendingPrimaryTerm(), retentionLeases.v2());
use of org.elasticsearch.index.seqno.RetentionLeases in project crate by crate.
the class IndexShard method updateShardState.
/**
 * Applies a new routing entry, and for primaries a possibly new primary term, to this shard.
 * <p>
 * NOTE(review): this listing has lost lines and tokens during extraction (closing braces, block-comment
 * delimiters and parts of several conditions are missing). The comments below only describe what is still visible.
 *
 * @param newRouting the routing entry to apply; must target this shard id and the same allocation as the current entry
 * @param newPrimaryTerm the primary term to apply; compared against {@code pendingPrimaryTerm} when the new routing is a primary
 * @param primaryReplicaSyncer invoked with this shard and a listener to run the primary-replica resync after a term bump
 * @param applyingClusterStateVersion forwarded to {@code replicationTracker.updateFromMaster} for primaries
 * @param inSyncAllocationIds forwarded to {@code replicationTracker.updateFromMaster} for primaries
 * @param routingTable forwarded to {@code replicationTracker.updateFromMaster}; its shards are also checked for retention leases
 * @throws IOException declared by the signature (presumably raised while persisting metadata -- confirm)
 * @throws IllegalArgumentException if the shard id differs, the allocation differs, or a primary would become a replica
 * @throws IndexShardRelocatedException if a relocating primary already marked as relocated receives an incompatible routing
 * @throws IllegalStateException if a primary-replica resync is already in progress when a term bump is requested
 */
public void updateShardState(final ShardRouting newRouting, final long newPrimaryTerm, final BiConsumer<IndexShard, ActionListener<ResyncTask>> primaryReplicaSyncer, final long applyingClusterStateVersion, final Set<String> inSyncAllocationIds, final IndexShardRoutingTable routingTable) throws IOException {
final ShardRouting currentRouting;
// snapshot the current routing and validate the new one while holding the shard mutex
synchronized (mutex) {
currentRouting = this.shardRouting;
assert currentRouting != null : "shardRouting must not be null";
// sanity checks: same shard, same allocation, and never primary -> replica
if (!newRouting.shardId().equals(shardId())) {
throw new IllegalArgumentException("Trying to set a routing entry with shardId " + newRouting.shardId() + " on a shard with shardId " + shardId());
if (newRouting.isSameAllocation(currentRouting) == false) {
throw new IllegalArgumentException("Trying to set a routing entry with a different allocation. Current " + currentRouting + ", new " + newRouting);
if (currentRouting.primary() && newRouting.primary() == false) {
throw new IllegalArgumentException("illegal state: trying to move shard from primary mode to replica mode. Current " + currentRouting + ", new " + newRouting);
// only primaries feed the in-sync set and routing table into the replication tracker
if (newRouting.primary()) {
replicationTracker.updateFromMaster(applyingClusterStateVersion, inSyncAllocationIds, routingTable);
// NOTE(review): the second half of this condition and the subject of the next assert are missing from this listing
if (state == IndexShardState.POST_RECOVERY && {
assert == false : "we are in POST_RECOVERY, but our shard routing is active " + currentRouting;
assert currentRouting.isRelocationTarget() == false || currentRouting.primary() == false || replicationTracker.isPrimaryMode() : "a primary relocation is completed by the master, but primary mode is not active " + currentRouting;
changeState(IndexShardState.STARTED, "global state is [" + newRouting.state() + "]");
// a relocating primary that the tracker reports as relocated rejects any routing that stops relocating
// or that differs (ignoring metadata) from the current one
} else if (currentRouting.primary() && currentRouting.relocating() && replicationTracker.isRelocated() && (newRouting.relocating() == false || newRouting.equalsIgnoringMetadata(currentRouting) == false)) {
// active primaries.
throw new IndexShardRelocatedException(shardId(), "Shard is marked as relocated, cannot safely move to state " + newRouting.state());
assert == false || state == IndexShardState.STARTED || state == IndexShardState.CLOSED : "routing is active, but local shard state isn't. routing: " + newRouting + ", local state: " + state;
persistMetadata(path, indexSettings, newRouting, currentRouting, logger);
// latch used by the term-bump path; NOTE(review): the statements that await it or count it down are not visible here
final CountDownLatch shardStateUpdated = new CountDownLatch(1);
if (newRouting.primary()) {
// same term as the pending one; NOTE(review): the rest of the nested condition and the activation call are missing
if (newPrimaryTerm == pendingPrimaryTerm) {
if (currentRouting.initializing() && {
if (currentRouting.isRelocationTarget() == false) {
// the master started a recovering primary, activate primary mode.
} else {
assert currentRouting.primary() == false : "term is only increased as part of primary promotion";
/* Note that due to cluster state batching an initializing primary shard term can failed and re-assigned
 * in one state causing it's term to be incremented. Note that if both current shard state and new
 * shard state are initializing, we could replace the current shard and reinitialize it. It is however
 * possible that this shard is being started. This can happen if:
 * 1) Shard is post recovery and sends shard started to the master
 * 2) Node gets disconnected and rejoins
 * 3) Master assigns the shard back to the node
 * 4) Master processes the shard started and starts the shard
 * 5) The node process the cluster state where the shard is both started and primary term is incremented.
 * We could fail the shard in that case, but this will cause it to be removed from the insync allocations list
 * potentially preventing re-allocation.
assert newRouting.initializing() == false : "a started primary shard should never update its term; " + "shard " + newRouting + ", " + "current term [" + pendingPrimaryTerm + "], " + "new term [" + newPrimaryTerm + "]";
assert newPrimaryTerm > pendingPrimaryTerm : "primary terms can only go up; current term [" + pendingPrimaryTerm + "], new term [" + newPrimaryTerm + "]";
 * Before this call returns, we are guaranteed that all future operations are delayed and so this happens before we
 * increment the primary term. The latch is needed to ensure that we do not unblock operations before the primary term is
 * incremented.
// to prevent primary relocation handoff while resync is not completed
boolean resyncStarted = primaryReplicaResyncInProgress.compareAndSet(false, true);
if (resyncStarted == false) {
throw new IllegalStateException("cannot start resync while it's already in progress");
// the callback checks that both the pending and the operation term equal the new term,
// then restores local history from the translog and starts the primary-replica resync
bumpPrimaryTerm(newPrimaryTerm, () -> {
assert pendingPrimaryTerm == newPrimaryTerm : "shard term changed on primary. expected [" + newPrimaryTerm + "] but was [" + pendingPrimaryTerm + "]" + ", current routing: " + currentRouting + ", new routing: " + newRouting;
assert getOperationPrimaryTerm() == newPrimaryTerm;
try {
 * If this shard was serving as a replica shard when another shard was promoted to primary then
 * its Lucene index was reset during the primary term transition. In particular, the Lucene index
 * on this shard was reset to the global checkpoint and the operations above the local checkpoint
 * were reverted. If the other shard that was promoted to primary subsequently fails before the
 * primary/replica re-sync completes successfully and we are now being promoted, we have to restore
 * the reverted operations on this shard by replaying the translog to avoid losing acknowledged writes.
final Engine engine = getEngine();
engine.restoreLocalHistoryFromTranslog((resettingEngine, snapshot) -> runTranslogRecovery(resettingEngine, snapshot, Engine.Operation.Origin.LOCAL_RESET, () -> {
if (indexSettings.getIndexVersionCreated().onOrBefore(Version.V_3_0_1)) {
// an index that was created before sequence numbers were introduced may contain operations in its
// translog that do not have sequence numbers. We want to make sure those operations will never
// be replayed as part of peer recovery to avoid an arbitrary mixture of operations with seq# (due
// to active indexing) and operations without a seq# coming from the translog. We therefore flush
// to create a lucene commit point to an empty translog file.
engine.flush(false, true);
/* Rolling the translog generation is not strictly needed here (as we will never have collisions between
 * sequence numbers in a translog generation in a new primary as it takes the last known sequence number
 * as a starting point), but it simplifies reasoning about the relationship between primary terms and
 * translog generations.
// record this copy's own local checkpoint in the replication tracker before the resync is started
replicationTracker.updateLocalCheckpoint(currentRouting.allocationId().getId(), getLocalCheckpoint());
primaryReplicaSyncer.accept(this, new ActionListener<ResyncTask>() {
public void onResponse(ResyncTask resyncTask) {"primary-replica resync completed with {} operations", resyncTask.getResyncedOperations());
boolean resyncCompleted = primaryReplicaResyncInProgress.compareAndSet(true, false);
assert resyncCompleted : "primary-replica resync finished but was not started";
// on failure the in-progress flag is cleared as well, and the shard is failed unless it is already CLOSED
public void onFailure(Exception e) {
boolean resyncCompleted = primaryReplicaResyncInProgress.compareAndSet(true, false);
assert resyncCompleted : "primary-replica resync finished but was not started";
if (state == IndexShardState.CLOSED) {
// ignore, shutting down
} else {
failShard("exception during primary-replica resync", e);
} catch (final AlreadyClosedException e) {
// okay, the index was deleted
}, null);
// set this last, once we finished updating all internal state.
this.shardRouting = newRouting;
assert this.shardRouting.primary() == false || // note that we use started and not active to avoid relocating shards
this.shardRouting.started() == false || // if permits are blocked, we are still transitioning
this.indexShardOperationPermits.isBlocked() || this.replicationTracker.isPrimaryMode() : "a started primary with non-pending operation term must be in primary mode " + this.shardRouting;
if ( == false && {
// notify the event listener only when the routing entry actually changed
if (newRouting.equals(currentRouting) == false) {
indexEventListener.shardRoutingChanged(this, currentRouting, newRouting);
// with soft deletes enabled, flip useRetentionLeasesInPeerRecovery to true based on which assigned copies hold a
// peer-recovery retention lease; NOTE(review): the quantifier of the check below is missing from this listing
if (indexSettings.isSoftDeleteEnabled() && useRetentionLeasesInPeerRecovery == false) {
final RetentionLeases retentionLeases = replicationTracker.getRetentionLeases();
final Set<ShardRouting> shardRoutings = new HashSet<>(routingTable.getShards());
// include relocation targets
if ( -> shr.assignedToNode() && retentionLeases.contains(ReplicationTracker.getPeerRecoveryRetentionLeaseId(shr)))) {
useRetentionLeasesInPeerRecovery = true;
use of org.elasticsearch.index.seqno.RetentionLeases in project crate by crate.
the class EngineTestCase method config.
public EngineConfig config(IndexSettings indexSettings, Store store, Path translogPath, MergePolicy mergePolicy, ReferenceManager.RefreshListener externalRefreshListener, ReferenceManager.RefreshListener internalRefreshListener, @Nullable LongSupplier maybeGlobalCheckpointSupplier, @Nullable Supplier<RetentionLeases> maybeRetentionLeasesSupplier) {
IndexWriterConfig iwc = newIndexWriterConfig();
TranslogConfig translogConfig = new TranslogConfig(shardId, translogPath, indexSettings, BigArrays.NON_RECYCLING_INSTANCE);
Engine.EventListener eventListener = new Engine.EventListener() {
public void onFailedEngine(String reason, @Nullable Exception e) {
// we don't need to notify anybody in this test
final List<ReferenceManager.RefreshListener> extRefreshListenerList = externalRefreshListener == null ? emptyList() : Collections.singletonList(externalRefreshListener);
final List<ReferenceManager.RefreshListener> intRefreshListenerList = internalRefreshListener == null ? emptyList() : Collections.singletonList(internalRefreshListener);
final LongSupplier globalCheckpointSupplier;
final Supplier<RetentionLeases> retentionLeasesSupplier;
if (maybeGlobalCheckpointSupplier == null) {
assert maybeRetentionLeasesSupplier == null;
final ReplicationTracker replicationTracker = new ReplicationTracker(shardId, allocationId.getId(), indexSettings, randomNonNegativeLong(), SequenceNumbers.NO_OPS_PERFORMED, update -> {
}, () -> 0L, (leases, listener) -> {
}, () -> SafeCommitInfo.EMPTY);
globalCheckpointSupplier = replicationTracker;
retentionLeasesSupplier = replicationTracker::getRetentionLeases;
} else {
assert maybeRetentionLeasesSupplier != null;
globalCheckpointSupplier = maybeGlobalCheckpointSupplier;
retentionLeasesSupplier = maybeRetentionLeasesSupplier;
return new EngineConfig(shardId, allocationId.getId(), threadPool, indexSettings, store, mergePolicy, iwc.getAnalyzer(), new CodecService(null, logger), eventListener, IndexSearcher.getDefaultQueryCache(), IndexSearcher.getDefaultQueryCachingPolicy(), translogConfig, TimeValue.timeValueMinutes(5), extRefreshListenerList, intRefreshListenerList, new NoneCircuitBreakerService(), globalCheckpointSupplier, retentionLeasesSupplier, primaryTerm, tombstoneDocSupplier());
use of org.elasticsearch.index.seqno.RetentionLeases in project crate by crate.
the class RecoverySourceHandlerTests method testSendOperationsConcurrently.
/**
 * Drives {@code RecoverySourceHandler.phase2} with a randomly generated translog snapshot and checks that the
 * recovery target receives exactly the parameters that were passed in, and that the reported result matches
 * what the target observed.
 * <p>
 * NOTE(review): several lines (loop bodies, listener completion, closing braces) are missing from this listing,
 * so the comments below only describe what is still visible.
 */
public void testSendOperationsConcurrently() throws Throwable {
final IndexShard shard = mock(IndexShard.class);
Set<Long> receivedSeqNos = ConcurrentCollections.newConcurrentSet();
long maxSeenAutoIdTimestamp = randomBoolean() ? -1 : randomNonNegativeLong();
long maxSeqNoOfUpdatesOrDeletes = randomBoolean() ? -1 : randomNonNegativeLong();
RetentionLeases retentionLeases = new RetentionLeases(randomNonNegativeLong(), randomNonNegativeLong(), Collections.emptySet());
long mappingVersion = randomNonNegativeLong();
AtomicLong localCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
int numOps = randomIntBetween(0, 1000);
AtomicBoolean received = new AtomicBoolean();
// target that verifies every per-batch parameter equals the value handed to phase2 below
RecoveryTargetHandler target = new TestRecoveryTargetHandler() {
public void indexTranslogOperations(List<Translog.Operation> operations, int receivedTotalOps, long receivedMaxSeenAutoIdTimestamp, long receivedMaxSeqNoOfUpdatesOrDeletes, RetentionLeases receivedRetentionLease, long receivedMappingVersion, ActionListener<Long> listener) {
assertThat(receivedMaxSeenAutoIdTimestamp, equalTo(maxSeenAutoIdTimestamp));
assertThat(receivedMaxSeqNoOfUpdatesOrDeletes, equalTo(maxSeqNoOfUpdatesOrDeletes));
assertThat(receivedRetentionLease, equalTo(retentionLeases));
assertThat(receivedMappingVersion, equalTo(mappingVersion));
assertThat(receivedTotalOps, equalTo(numOps));
for (Translog.Operation operation : operations) {
// randomly advance the simulated target local checkpoint
if (randomBoolean()) {
localCheckpoint.addAndGet(randomIntBetween(1, 100));
PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> sendFuture = new PlainActionFuture<>();
// random seq# window; operations outside it, or listed in skipOperations, are excluded from the expected set below
long startingSeqNo = randomIntBetween(0, 1000);
long endingSeqNo = startingSeqNo + randomIntBetween(0, 10000);
List<Translog.Operation> operations = generateOperations(numOps);
List<Translog.Operation> skipOperations = randomSubsetOf(operations);
Translog.Snapshot snapshot = newTranslogSnapshot(operations, skipOperations);
RecoverySourceHandler handler = new RecoverySourceHandler(shard, new AsyncRecoveryTarget(target, recoveryExecutor), threadPool, getStartRecoveryRequest(), between(1, 10 * 1024), between(1, 5), between(1, 5));
handler.phase2(startingSeqNo, endingSeqNo, snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, retentionLeases, mappingVersion, sendFuture);
RecoverySourceHandler.SendSnapshotResult sendSnapshotResult = sendFuture.actionGet();
// the result must report the checkpoint the target ended on and as many operations as the target recorded
assertThat(sendSnapshotResult.targetLocalCheckpoint, equalTo(localCheckpoint.get()));
assertThat(sendSnapshotResult.sentOperations, equalTo(receivedSeqNos.size()));
// recompute the expected sequence numbers from the window and the skipped operations
Set<Long> sentSeqNos = new HashSet<>();
for (Translog.Operation op : operations) {
if (startingSeqNo <= op.seqNo() && op.seqNo() <= endingSeqNo && skipOperations.contains(op) == false) {
assertThat(receivedSeqNos, equalTo(sentSeqNos));
use of org.elasticsearch.index.seqno.RetentionLeases in project crate by crate.
the class RecoverySourceHandlerTests method testSendSnapshotStopOnError.
/**
 * Checks that a failure reported by the recovery target while indexing translog operations surfaces from
 * {@code RecoverySourceHandler.phase2} as a {@code RecoveryEngineException} that wraps the original cause.
 * <p>
 * NOTE(review): some lines (one branch body, the update of {@code wasFailed}, closing braces) are missing from
 * this listing, so the comments below only describe what is still visible.
 */
public void testSendSnapshotStopOnError() throws Exception {
final int fileChunkSizeInBytes = between(1, 10 * 1024);
final StartRecoveryRequest request = getStartRecoveryRequest();
final IndexShard shard = mock(IndexShard.class);
final List<Translog.Operation> ops = new ArrayList<>();
// build between 1 and 256 index operations; the loop index is passed to IndexResult, presumably as the seq# -- confirm
for (int numOps = between(1, 256), i = 0; i < numOps; i++) {
final Engine.Index index = getIndex(Integer.toString(i));
ops.add(new Translog.Index(index, new Engine.IndexResult(1, 1, i, true)));
final AtomicBoolean wasFailed = new AtomicBoolean();
// target that randomly fails a batch with a RuntimeException
RecoveryTargetHandler recoveryTarget = new TestRecoveryTargetHandler() {
public void indexTranslogOperations(List<Translog.Operation> operations, int totalTranslogOps, long timestamp, long msu, RetentionLeases retentionLeases, long mappingVersion, ActionListener<Long> listener) {
if (randomBoolean()) {
} else {
listener.onFailure(new RuntimeException("test - failed to index"));
RecoverySourceHandler handler = new RecoverySourceHandler(shard, new AsyncRecoveryTarget(recoveryTarget, threadPool.generic()), threadPool, request, fileChunkSizeInBytes, between(1, 10), between(1, 10));
PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> future = new PlainActionFuture<>();
// pick a seq# range inside [0, ops.size() - 1]
final long startingSeqNo = randomLongBetween(0, ops.size() - 1L);
final long endingSeqNo = randomLongBetween(startingSeqNo, ops.size() - 1L);
handler.phase2(startingSeqNo, endingSeqNo, newTranslogSnapshot(ops, Collections.emptyList()), randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, randomNonNegativeLong(), future);
// only the failure path is asserted; wasFailed is presumably set by the target on failure (that line is not visible here)
if (wasFailed.get()) {
final RecoveryEngineException error = expectThrows(RecoveryEngineException.class, future::actionGet);
assertThat(error.getMessage(), equalTo("Phase[2] failed to send/replay operations"));
assertThat(error.getCause().getMessage(), equalTo("test - failed to index"));