use of org.elasticsearch.action.ActionListener in project crate by crate.
the class RecoverySourceHandlerTests method testCancelRecoveryDuringPhase1.
@Test
public void testCancelRecoveryDuringPhase1() throws Exception {
Store store = newStore(createTempDir("source"), false);
IndexShard shard = mock(IndexShard.class);
when(shard.store()).thenReturn(store);
Directory dir = store.directory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
int numDocs = randomIntBetween(10, 100);
for (int i = 0; i < numDocs; i++) {
Document document = new Document();
document.add(new StringField("id", Integer.toString(i), Field.Store.YES));
document.add(newField("field", randomUnicodeOfCodepointLengthBetween(1, 10), TextField.TYPE_STORED));
writer.addDocument(document);
}
writer.commit();
writer.close();
AtomicBoolean wasCancelled = new AtomicBoolean();
SetOnce<Runnable> cancelRecovery = new SetOnce<>();
final TestRecoveryTargetHandler recoveryTarget = new TestRecoveryTargetHandler() {
@Override
public void receiveFileInfo(List<String> phase1FileNames, List<Long> phase1FileSizes, List<String> phase1ExistingFileNames, List<Long> phase1ExistingFileSizes, int totalTranslogOps, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (randomBoolean()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
@Override
public void writeFileChunk(StoreFileMetadata md, long position, BytesReference content, boolean lastChunk, int totalTranslogOps, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (rarely()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
@Override
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetadata, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (randomBoolean()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
};
StartRecoveryRequest startRecoveryRequest = getStartRecoveryRequest();
final RecoverySourceHandler handler = new RecoverySourceHandler(shard, recoveryTarget, threadPool, startRecoveryRequest, between(1, 16), between(1, 4), between(1, 4)) {
@Override
void createRetentionLease(long startingSeqNo, ActionListener<RetentionLease> listener) {
final String leaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId(startRecoveryRequest.targetNode().getId());
listener.onResponse(new RetentionLease(leaseId, startingSeqNo, threadPool.absoluteTimeInMillis(), ReplicationTracker.PEER_RECOVERY_RETENTION_LEASE_SOURCE));
}
};
cancelRecovery.set(() -> handler.cancel("test"));
final StepListener<RecoverySourceHandler.SendFileResult> phase1Listener = new StepListener<>();
try {
final CountDownLatch latch = new CountDownLatch(1);
handler.phase1(DirectoryReader.listCommits(dir).get(0), 0, () -> 0, new LatchedActionListener<>(phase1Listener, latch));
latch.await();
phase1Listener.result();
} catch (Exception e) {
assertTrue(wasCancelled.get());
assertNotNull(ExceptionsHelper.unwrap(e, CancellableThreads.ExecutionCancelledException.class));
}
store.close();
}
use of org.elasticsearch.action.ActionListener in project crate by crate.
the class HandshakingTransportAddressConnector method connectToRemoteMasterNode.
@Override
public void connectToRemoteMasterNode(TransportAddress transportAddress, ActionListener<DiscoveryNode> listener) {
transportService.getThreadPool().generic().execute(new ActionRunnable<>(listener) {
@Override
protected void doRun() {
// TODO if transportService is already connected to this address then skip the handshaking
final DiscoveryNode targetNode = new DiscoveryNode("", transportAddress.toString(), // generated deterministically for reproducible tests
UUIDs.randomBase64UUID(Randomness.get()), transportAddress.address().getHostString(), transportAddress.getAddress(), transportAddress, emptyMap(), emptySet(), Version.CURRENT.minimumCompatibilityVersion());
LOGGER.trace("[{}] opening probe connection", this);
transportService.openConnection(targetNode, ConnectionProfile.buildSingleChannelProfile(Type.REG, probeConnectTimeout, probeHandshakeTimeout, TimeValue.MINUS_ONE, null), new ActionListener<>() {
@Override
public void onResponse(Connection connection) {
LOGGER.trace("[{}] opened probe connection", this);
// use NotifyOnceListener to make sure the following line does not result in onFailure being called when
// the connection is closed in the onResponse handler
transportService.handshake(connection, probeHandshakeTimeout.millis(), new NotifyOnceListener<DiscoveryNode>() {
@Override
protected void innerOnResponse(DiscoveryNode remoteNode) {
try {
// success means (amongst other things) that the cluster names match
LOGGER.trace("[{}] handshake successful: {}", this, remoteNode);
IOUtils.closeWhileHandlingException(connection);
if (remoteNode.equals(transportService.getLocalNode())) {
// TODO cache this result for some time? forever?
listener.onFailure(new ConnectTransportException(remoteNode, "local node found"));
} else if (remoteNode.isMasterEligibleNode() == false) {
// TODO cache this result for some time?
listener.onFailure(new ConnectTransportException(remoteNode, "non-master-eligible node found"));
} else {
transportService.connectToNode(remoteNode, new ActionListener<Void>() {
@Override
public void onResponse(Void ignored) {
LOGGER.trace("[{}] full connection successful: {}", this, remoteNode);
listener.onResponse(remoteNode);
}
@Override
public void onFailure(Exception e) {
listener.onFailure(e);
}
});
}
} catch (Exception e) {
listener.onFailure(e);
}
}
@Override
protected void innerOnFailure(Exception e) {
// we opened a connection and successfully performed a low-level handshake, so we were definitely
// talking to an Elasticsearch node, but the high-level handshake failed indicating some kind of
// mismatched configurations (e.g. cluster name) that the user should address
LOGGER.warn(new ParameterizedMessage("handshake failed for [{}]", this), e);
IOUtils.closeWhileHandlingException(connection);
listener.onFailure(e);
}
});
}
@Override
public void onFailure(Exception e) {
listener.onFailure(e);
}
});
}
@Override
public String toString() {
return "connectToRemoteMasterNode[" + transportAddress + "]";
}
});
}
use of org.elasticsearch.action.ActionListener in project crate by crate.
the class RecoverySourceHandler method recoverToTarget.
/**
* performs the recovery from the local engine to the target
*/
public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
final Closeable releaseResources = () -> IOUtils.close(resources);
final ActionListener<RecoveryResponse> wrappedListener = ActionListener.notifyOnce(listener);
try {
cancellableThreads.setOnCancel((reason, beforeCancelEx) -> {
final RuntimeException e;
if (shard.state() == IndexShardState.CLOSED) {
// check if the shard got closed on us
e = new IndexShardClosedException(shard.shardId(), "shard is closed and recovery was canceled reason [" + reason + "]");
} else {
e = new CancellableThreads.ExecutionCancelledException("recovery was canceled reason [" + reason + "]");
}
if (beforeCancelEx != null) {
e.addSuppressed(beforeCancelEx);
}
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
throw e;
});
final Consumer<Exception> onFailure = e -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[onFailure]");
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
};
final boolean softDeletesEnabled = shard.indexSettings().isSoftDeleteEnabled();
final SetOnce<RetentionLease> retentionLeaseRef = new SetOnce<>();
runUnderPrimaryPermit(() -> {
final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable();
ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId());
if (targetShardRouting == null) {
logger.debug("delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(), request.targetNode());
throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node");
}
assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting;
retentionLeaseRef.set(shard.getRetentionLeases().get(ReplicationTracker.getPeerRecoveryRetentionLeaseId(targetShardRouting)));
}, shardId + " validating recovery target [" + request.targetAllocationId() + "] registered ", shard, cancellableThreads, logger);
final Engine.HistorySource historySource;
if (softDeletesEnabled && (shard.useRetentionLeasesInPeerRecovery() || retentionLeaseRef.get() != null)) {
historySource = Engine.HistorySource.INDEX;
} else {
historySource = Engine.HistorySource.TRANSLOG;
}
final Closeable retentionLock = shard.acquireHistoryRetentionLock(historySource);
resources.add(retentionLock);
final long startingSeqNo;
final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO && isTargetSameHistory() && shard.hasCompleteHistoryOperations("peer-recovery", historySource, request.startingSeqNo()) && (historySource == Engine.HistorySource.TRANSLOG || (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
if (isSequenceNumberBasedRecovery && softDeletesEnabled && retentionLeaseRef.get() != null) {
// all the history we need is retained by an existing retention lease, so we do not need a separate retention lock
retentionLock.close();
logger.trace("history is retained by {}", retentionLeaseRef.get());
} else {
// all the history we need is retained by the retention lock, obtained before calling shard.hasCompleteHistoryOperations()
// and before acquiring the safe commit we'll be using, so we can be certain that all operations after the safe commit's
// local checkpoint will be retained for the duration of this recovery.
logger.trace("history is retained by retention lock");
}
final StepListener<SendFileResult> sendFileStep = new StepListener<>();
final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
final StepListener<Void> finalizeStep = new StepListener<>();
if (isSequenceNumberBasedRecovery) {
logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
startingSeqNo = request.startingSeqNo();
if (retentionLeaseRef.get() == null) {
createRetentionLease(startingSeqNo, ActionListener.map(sendFileStep, ignored -> SendFileResult.EMPTY));
} else {
sendFileStep.onResponse(SendFileResult.EMPTY);
}
} else {
final Engine.IndexCommitRef safeCommitRef;
try {
safeCommitRef = shard.acquireSafeIndexCommit();
resources.add(safeCommitRef);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
}
// Try and copy enough operations to the recovering peer so that if it is promoted to primary then it has a chance of being
// able to recover other replicas using operations-based recoveries. If we are not using retention leases then we
// conservatively copy all available operations. If we are using retention leases then "enough operations" is just the
// operations from the local checkpoint of the safe commit onwards, because when using soft deletes the safe commit retains
// at least as much history as anything else. The safe commit will often contain all the history retained by the current set
// of retention leases, but this is not guaranteed: an earlier peer recovery from a different primary might have created a
// retention lease for some history that this primary already discarded, since we discard history when the global checkpoint
// advances and not when creating a new safe commit. In any case this is a best-effort thing since future recoveries can
// always fall back to file-based ones, and only really presents a problem if this primary fails before things have settled
// down.
startingSeqNo = softDeletesEnabled ? Long.parseLong(safeCommitRef.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1L : 0;
logger.trace("performing file-based recovery followed by history replay starting at [{}]", startingSeqNo);
try {
final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo);
final Releasable releaseStore = acquireStore(shard.store());
resources.add(releaseStore);
sendFileStep.whenComplete(r -> IOUtils.close(safeCommitRef, releaseStore), e -> {
try {
IOUtils.close(safeCommitRef, releaseStore);
} catch (final IOException ex) {
logger.warn("releasing snapshot caused exception", ex);
}
});
final StepListener<ReplicationResponse> deleteRetentionLeaseStep = new StepListener<>();
runUnderPrimaryPermit(() -> {
try {
// If the target previously had a copy of this shard then a file-based recovery might move its global
// checkpoint backwards. We must therefore remove any existing retention lease so that we can create a
// new one later on in the recovery.
shard.removePeerRecoveryRetentionLease(request.targetNode().getId(), new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, deleteRetentionLeaseStep, false));
} catch (RetentionLeaseNotFoundException e) {
logger.debug("no peer-recovery retention lease for " + request.targetAllocationId());
deleteRetentionLeaseStep.onResponse(null);
}
}, shardId + " removing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
deleteRetentionLeaseStep.whenComplete(ignored -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
phase1(safeCommitRef.getIndexCommit(), startingSeqNo, () -> estimateNumOps, sendFileStep);
}, onFailure);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "sendFileStep failed", e);
}
}
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
sendFileStep.whenComplete(r -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
// For a sequence based recovery, the target can keep its local translog
prepareTargetForTranslog(shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo), prepareEngineStep);
}, onFailure);
prepareEngineStep.whenComplete(prepareEngineTime -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase2]");
/*
* add shard to replication group (shard will receive replication requests from this point on)
* now that engine is open. This means that any document indexed into the primary after
* this will be replicated to this replica as well make sure to do this before sampling
* the max sequence number in the next step, to ensure that we send all documents up to
* maxSeqNo in phase2.
*/
runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()), shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
// CRATE_PATCH
try {
blobRecoveryHook();
} catch (Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "blobRecoveryHook failed", e);
}
if (logger.isTraceEnabled()) {
logger.trace("snapshot translog for recovery; current size is [{}]", shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo));
}
final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", historySource, startingSeqNo);
resources.add(phase2Snapshot);
retentionLock.close();
// we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
// are at least as high as the corresponding values on the primary when any of these operations were executed on it.
final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
final RetentionLeases retentionLeases = shard.getRetentionLeases();
final long mappingVersionOnPrimary = shard.indexSettings().getIndexMetadata().getMappingVersion();
phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, retentionLeases, mappingVersionOnPrimary, sendSnapshotStep);
}, onFailure);
// Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in the phase 2
final long trimAboveSeqNo = startingSeqNo - 1;
sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);
finalizeStep.whenComplete(r -> {
// TODO: return the actual throttle time
final long phase1ThrottlingWaitTime = 0L;
final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result();
final SendFileResult sendFileResult = sendFileStep.result();
final RecoveryResponse response = new RecoveryResponse(sendFileResult.phase1FileNames, sendFileResult.phase1FileSizes, sendFileResult.phase1ExistingFileNames, sendFileResult.phase1ExistingFileSizes, sendFileResult.totalSize, sendFileResult.existingTotalSize, sendFileResult.took.millis(), phase1ThrottlingWaitTime, prepareEngineStep.result().millis(), sendSnapshotResult.sentOperations, sendSnapshotResult.tookTime.millis());
try {
wrappedListener.onResponse(response);
} finally {
IOUtils.close(resources);
}
}, onFailure);
} catch (Exception e) {
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
}
}
use of org.elasticsearch.action.ActionListener in project crate by crate.
the class RemoteRecoveryTargetHandler method finalizeRecovery.
@Override
public void finalizeRecovery(final long globalCheckpoint, final long trimAboveSeqNo, final ActionListener<Void> listener) {
final String action = PeerRecoveryTargetService.Actions.FINALIZE;
final RecoveryFinalizeRecoveryRequest request = new RecoveryFinalizeRecoveryRequest(recoveryId, shardId, globalCheckpoint, trimAboveSeqNo);
final TransportRequestOptions options = TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionLongTimeout()).build();
final Writeable.Reader<TransportResponse.Empty> reader = in -> TransportResponse.Empty.INSTANCE;
final ActionListener<TransportResponse.Empty> responseListener = ActionListener.map(listener, r -> null);
executeRetryableAction(action, request, options, responseListener, reader);
}
use of org.elasticsearch.action.ActionListener in project crate by crate.
the class RemoteRecoveryTargetHandler method receiveFileInfo.
@Override
public void receiveFileInfo(List<String> phase1FileNames, List<Long> phase1FileSizes, List<String> phase1ExistingFileNames, List<Long> phase1ExistingFileSizes, int totalTranslogOps, ActionListener<Void> listener) {
final String action = PeerRecoveryTargetService.Actions.FILES_INFO;
RecoveryFilesInfoRequest request = new RecoveryFilesInfoRequest(recoveryId, shardId, phase1FileNames, phase1FileSizes, phase1ExistingFileNames, phase1ExistingFileSizes, totalTranslogOps);
final TransportRequestOptions options = TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build();
final Writeable.Reader<TransportResponse.Empty> reader = in -> TransportResponse.Empty.INSTANCE;
final ActionListener<TransportResponse.Empty> responseListener = ActionListener.map(listener, r -> null);
executeRetryableAction(action, request, options, responseListener, reader);
}
Aggregations