use of io.crate.common.unit.TimeValue in project crate by crate.
the class RecoveryState method toXContent.
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.field(Fields.ID, shardId.id());
builder.field(Fields.TYPE, recoverySource.getType());
builder.field(Fields.STAGE, stage.toString());
builder.field(Fields.PRIMARY, primary);
builder.timeField(Fields.START_TIME_IN_MILLIS, Fields.START_TIME, timer.startTime);
if (timer.stopTime > 0) {
builder.timeField(Fields.STOP_TIME_IN_MILLIS, Fields.STOP_TIME, timer.stopTime);
}
builder.humanReadableField(Fields.TOTAL_TIME_IN_MILLIS, Fields.TOTAL_TIME, new TimeValue(timer.time()));
if (recoverySource.getType() == RecoverySource.Type.PEER) {
builder.startObject(Fields.SOURCE);
builder.field(Fields.ID, sourceNode.getId());
builder.field(Fields.HOST, sourceNode.getHostName());
builder.field(Fields.TRANSPORT_ADDRESS, sourceNode.getAddress().toString());
builder.field(Fields.IP, sourceNode.getHostAddress());
builder.field(Fields.NAME, sourceNode.getName());
builder.endObject();
} else {
builder.startObject(Fields.SOURCE);
recoverySource.addAdditionalFields(builder, params);
builder.endObject();
}
builder.startObject(Fields.TARGET);
builder.field(Fields.ID, targetNode.getId());
builder.field(Fields.HOST, targetNode.getHostName());
builder.field(Fields.TRANSPORT_ADDRESS, targetNode.getAddress().toString());
builder.field(Fields.IP, targetNode.getHostAddress());
builder.field(Fields.NAME, targetNode.getName());
builder.endObject();
builder.startObject(Fields.INDEX);
index.toXContent(builder, params);
builder.endObject();
builder.startObject(Fields.TRANSLOG);
translog.toXContent(builder, params);
builder.endObject();
builder.startObject(Fields.VERIFY_INDEX);
verifyIndex.toXContent(builder, params);
builder.endObject();
return builder;
}
use of io.crate.common.unit.TimeValue in project crate by crate.
the class RemoteRecoveryTargetHandler method executeRetryableAction.
private <T extends TransportResponse> void executeRetryableAction(String action, TransportRequest request, TransportRequestOptions options, ActionListener<T> actionListener, Writeable.Reader<T> reader) {
final Object key = new Object();
final ActionListener<T> removeListener = ActionListener.runBefore(actionListener, () -> onGoingRetryableActions.remove(key));
final TimeValue initialDelay = TimeValue.timeValueMillis(200);
final TimeValue timeout = recoverySettings.internalActionRetryTimeout();
final RetryableAction<T> retryableAction = new RetryableAction<T>(LOGGER, threadPool, initialDelay, timeout, removeListener) {
@Override
public void tryAction(ActionListener<T> listener) {
transportService.sendRequest(targetNode, action, request, options, new ActionListenerResponseHandler<>(listener, reader, ThreadPool.Names.GENERIC));
}
@Override
public boolean shouldRetry(Exception e) {
return retryableException(e);
}
};
onGoingRetryableActions.put(key, retryableAction);
retryableAction.run();
if (isCancelled) {
retryableAction.cancel(new CancellableThreads.ExecutionCancelledException("recovery was cancelled"));
}
}
use of io.crate.common.unit.TimeValue in project crate by crate.
the class PeerRecoveryTargetService method doRecovery.
private void doRecovery(final long recoveryId) {
final StartRecoveryRequest request;
final RecoveryState.Timer timer;
CancellableThreads cancellableThreads;
try (RecoveryRef recoveryRef = onGoingRecoveries.getRecovery(recoveryId)) {
if (recoveryRef == null) {
LOGGER.trace("not running recovery with id [{}] - can not find it (probably finished)", recoveryId);
return;
}
final RecoveryTarget recoveryTarget = recoveryRef.target();
timer = recoveryTarget.state().getTimer();
cancellableThreads = recoveryTarget.cancellableThreads();
try {
assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
LOGGER.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
recoveryTarget.indexShard().prepareForIndexRecovery();
final long startingSeqNo = recoveryTarget.indexShard().recoverLocallyUpToGlobalCheckpoint();
assert startingSeqNo == UNASSIGNED_SEQ_NO || recoveryTarget.state().getStage() == RecoveryState.Stage.TRANSLOG : "unexpected recovery stage [" + recoveryTarget.state().getStage() + "] starting seqno [ " + startingSeqNo + "]";
request = getStartRecoveryRequest(LOGGER, clusterService.localNode(), recoveryTarget, startingSeqNo);
} catch (final Exception e) {
// this will be logged as warning later on...
LOGGER.trace("unexpected error while preparing shard for peer recovery, failing recovery", e);
onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e), true);
return;
}
}
Consumer<Exception> handleException = e -> {
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(() -> new ParameterizedMessage("[{}][{}] Got exception on recovery", request.shardId().getIndex().getName(), request.shardId().id()), e);
}
Throwable cause = SQLExceptions.unwrap(e);
if (cause instanceof CancellableThreads.ExecutionCancelledException) {
// this can also come from the source wrapped in a RemoteTransportException
onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source has canceled the recovery", cause), false);
return;
}
if (cause instanceof RecoveryEngineException) {
// unwrap an exception that was thrown as part of the recovery
cause = cause.getCause();
}
// do it twice, in case we have double transport exception
cause = SQLExceptions.unwrap(cause);
if (cause instanceof RecoveryEngineException) {
// unwrap an exception that was thrown as part of the recovery
cause = cause.getCause();
}
if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) {
// if the target is not ready yet, retry
retryRecovery(recoveryId, "remote shard not ready", recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
return;
}
if (cause instanceof DelayRecoveryException) {
retryRecovery(recoveryId, cause, recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
return;
}
if (cause instanceof ConnectTransportException) {
LOGGER.debug("delaying recovery of {} for [{}] due to networking error [{}]", request.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage());
retryRecovery(recoveryId, cause.getMessage(), recoverySettings.retryDelayNetwork(), recoverySettings.activityTimeout());
return;
}
if (cause instanceof AlreadyClosedException) {
onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source shard is closed", cause), false);
return;
}
onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, e), true);
};
try {
LOGGER.trace("{} starting recovery from {}", request.shardId(), request.sourceNode());
cancellableThreads.executeIO(() -> transportService.sendRequest(request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request, new TransportResponseHandler<RecoveryResponse>() {
@Override
public void handleResponse(RecoveryResponse recoveryResponse) {
final TimeValue recoveryTime = new TimeValue(timer.time());
// do this through ongoing recoveries to remove it from the collection
onGoingRecoveries.markRecoveryAsDone(recoveryId);
if (LOGGER.isTraceEnabled()) {
StringBuilder sb = new StringBuilder();
sb.append('[').append(request.shardId().getIndex().getName()).append(']').append('[').append(request.shardId().id()).append("] ");
sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(recoveryTime).append("]\n");
sb.append(" phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]").append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']').append("\n");
sb.append(" : reusing_files [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n");
sb.append(" phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n");
sb.append(" : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log operations").append(", took [").append(timeValueMillis(recoveryResponse.phase2Time)).append("]").append("\n");
LOGGER.trace("{}", sb);
} else {
LOGGER.debug("{} recovery done from [{}], took [{}]", request.shardId(), request.sourceNode(), recoveryTime);
}
}
@Override
public void handleException(TransportException e) {
handleException.accept(e);
}
@Override
public String executor() {
// we do some heavy work like refreshes in the response so fork off to the generic threadpool
return ThreadPool.Names.GENERIC;
}
@Override
public RecoveryResponse read(StreamInput in) throws IOException {
return new RecoveryResponse(in);
}
}));
} catch (CancellableThreads.ExecutionCancelledException e) {
LOGGER.trace("recovery cancelled", e);
} catch (Exception e) {
handleException.accept(e);
}
}
use of io.crate.common.unit.TimeValue in project crate by crate.
the class TransportNodesListShardStoreMetadata method listStoreMetadata.
private StoreFilesMetadata listStoreMetadata(NodeRequest request) throws IOException {
final ShardId shardId = request.getShardId();
logger.trace("listing store meta data for {}", shardId);
long startTimeNS = System.nanoTime();
boolean exists = false;
try {
IndexService indexService = indicesService.indexService(shardId.getIndex());
if (indexService != null) {
IndexShard indexShard = indexService.getShardOrNull(shardId.id());
if (indexShard != null) {
try {
final StoreFilesMetadata storeFilesMetadata = new StoreFilesMetadata(shardId, indexShard.snapshotStoreMetadata(), indexShard.getPeerRecoveryRetentionLeases());
exists = true;
return storeFilesMetadata;
} catch (org.apache.lucene.index.IndexNotFoundException e) {
logger.trace(new ParameterizedMessage("[{}] node is missing index, responding with empty", shardId), e);
return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
} catch (IOException e) {
logger.warn(new ParameterizedMessage("[{}] can't read metadata from store, responding with empty", shardId), e);
return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
}
}
}
final String customDataPath;
if (request.getCustomDataPath() != null) {
customDataPath = request.getCustomDataPath();
} else {
// TODO: Fallback for BWC with older ES versions. Remove this once request.getCustomDataPath() always returns non-null
if (indexService != null) {
customDataPath = indexService.getIndexSettings().customDataPath();
} else {
IndexMetadata metadata = clusterService.state().metadata().index(shardId.getIndex());
if (metadata != null) {
customDataPath = new IndexSettings(metadata, settings).customDataPath();
} else {
logger.trace("{} node doesn't have meta data for the requests index", shardId);
throw new ElasticsearchException("node doesn't have meta data for index " + shardId.getIndex());
}
}
}
final ShardPath shardPath = ShardPath.loadShardPath(logger, nodeEnv, shardId, customDataPath);
if (shardPath == null) {
return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList());
}
// note that this may fail if it can't get access to the shard lock. Since we check above there is an active shard, this means:
// 1) a shard is being constructed, which means the master will not use a copy of this replica
// 2) A shard is shutting down and has not cleared it's content within lock timeout. In this case the master may not
// reuse local resources.
final Store.MetadataSnapshot metadataSnapshot = Store.readMetadataSnapshot(shardPath.resolveIndex(), shardId, nodeEnv::shardLock, logger);
// we refresh shard info after the primary has started. Hence, we can ignore retention leases if there is no active shard.
return new StoreFilesMetadata(shardId, metadataSnapshot, Collections.emptyList());
} finally {
TimeValue took = new TimeValue(System.nanoTime() - startTimeNS, TimeUnit.NANOSECONDS);
if (exists) {
logger.debug("{} loaded store meta data (took [{}])", shardId, took);
} else {
logger.trace("{} didn't find any store meta data to load (took [{}])", shardId, took);
}
}
}
use of io.crate.common.unit.TimeValue in project crate by crate.
the class LimitedBackoffPolicyTest method testNoNext.
@Test
public void testNoNext() throws Exception {
BackoffPolicy policy = new LimitedExponentialBackoff(0, 1, Integer.MAX_VALUE);
Iterator<TimeValue> it = policy.iterator();
it.next();
expectedException.expect(NoSuchElementException.class);
expectedException.expectMessage("Reached maximum amount of backoff iterations. Only 1 iterations allowed.");
it.next();
}
Aggregations