Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
The class Http2NetworkClient, method warmUpConnections.
@Override
public int warmUpConnections(List<DataNodeId> dataNodeIds, int connectionWarmUpPercentagePerDataNode,
    long timeForWarmUp, List<ResponseInfo> responseInfoList) {
  long startTime = System.currentTimeMillis();
  AtomicInteger successCount = new AtomicInteger();
  AtomicInteger failCount = new AtomicInteger();
  int warmUpConnectionPerPort =
      http2ClientConfig.http2MinConnectionPerPort * connectionWarmUpPercentagePerDataNode / 100;
  int expectedConnections = dataNodeIds.size() * warmUpConnectionPerPort;
  for (DataNodeId dataNodeId : dataNodeIds) {
    for (int i = 0; i < warmUpConnectionPerPort; i++) {
      this.pools.get(InetSocketAddress.createUnresolved(dataNodeId.getHostname(), dataNodeId.getHttp2Port()))
          .acquire()
          .addListener((GenericFutureListener<Future<Channel>>) future -> {
            if (future.isSuccess()) {
              Channel streamChannel = future.getNow();
              releaseAndCloseStreamChannel(streamChannel);
              successCount.incrementAndGet();
            } else {
              failCount.incrementAndGet();
              responseInfoList.add(new ResponseInfo(null, NetworkClientErrorCode.NetworkError, null, dataNodeId));
              logger.error("Couldn't acquire stream channel to {}:{}. Cause: {}.", dataNodeId.getHostname(),
                  dataNodeId.getHttp2Port(), future.cause());
            }
          });
    }
  }
  while (System.currentTimeMillis() - startTime < timeForWarmUp) {
    if (successCount.get() + failCount.get() == expectedConnections) {
      break;
    } else {
      try {
        Thread.sleep(300);
      } catch (InterruptedException e) {
        break;
      }
    }
  }
  logger.info("HTTP2 connection warm up done. Tried: {}, Succeeded: {}, Failed: {}, Time elapsed: {} ms",
      expectedConnections, successCount, failCount, System.currentTimeMillis() - startTime);
  return successCount.get();
}
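Note that warmUpConnectionPerPort uses integer division, so a small percentage can round down to zero warm-up connections per port. A minimal caller-side sketch follows; clusterMap, http2NetworkClient, and logger are assumed handles for illustration, not names from the snippet above.

// Hedged sketch: warm up 50% of the configured connections per node at startup,
// waiting at most 5 seconds for the acquisitions to finish.
void warmUpAtStartup() {
  List<DataNodeId> nodes = clusterMap.getDataNodeIds();
  List<ResponseInfo> failedAcquisitions = new ArrayList<>();
  int warmed = http2NetworkClient.warmUpConnections(nodes, 50, 5000, failedAcquisitions);
  logger.info("Warmed up {} HTTP2 connections; {} failures", warmed, failedAcquisitions.size());
}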
Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
The class ReplicaThread, method exchangeMetadata.
/**
 * Gets all the metadata about messages from the remote replicas since the last token. Checks the messages with the
 * local store and finds all the messages that are missing. For the messages that are not missing, updates the delete
 * and ttl state.
 * @param connectedChannel The connected channel that represents a connection to the remote replica
 * @param replicasToReplicatePerNode The information about the replicas that are being replicated
 * @return The list of ExchangeMetadataResponse that contains the set of store keys that are missing from the local
 *         store and are present in the remote replicas, along with the new token from the remote replicas
 * @throws IOException
 * @throws ReplicationException
 */
List<ExchangeMetadataResponse> exchangeMetadata(ConnectedChannel connectedChannel,
    List<RemoteReplicaInfo> replicasToReplicatePerNode) throws IOException, ReplicationException {
  long exchangeMetadataStartTimeInMs = time.milliseconds();
  List<ExchangeMetadataResponse> exchangeMetadataResponseList = new ArrayList<>();
  if (replicasToReplicatePerNode.size() > 0) {
    try {
      DataNodeId remoteNode = replicasToReplicatePerNode.get(0).getReplicaId().getDataNodeId();
      ReplicaMetadataResponse response =
          getReplicaMetadataResponse(replicasToReplicatePerNode, connectedChannel, remoteNode);
      long startTimeInMs = time.milliseconds();
      Map<StoreKey, StoreKey> remoteKeyToLocalKeyMap = batchConvertReplicaMetadataResponseKeys(response);
      for (int i = 0; i < response.getReplicaMetadataResponseInfoList().size(); i++) {
        RemoteReplicaInfo remoteReplicaInfo = replicasToReplicatePerNode.get(i);
        ReplicaMetadataResponseInfo replicaMetadataResponseInfo =
            response.getReplicaMetadataResponseInfoList().get(i);
        responseHandler.onEvent(remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getError());
        if (replicaMetadataResponseInfo.getError() == ServerErrorCode.No_Error) {
          // Skip stores that were stopped during the call to getReplicaMetadataResponse
          if (!remoteReplicaInfo.getLocalStore().isStarted()) {
            exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Temporarily_Disabled));
          } else {
            try {
              logger.trace("Remote node: {} Thread name: {} Remote replica: {} Token from remote: {} Replica lag: {} ",
                  remoteNode, threadName, remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getFindToken(),
                  replicaMetadataResponseInfo.getRemoteReplicaLagInBytes());
              Set<MessageInfo> remoteMissingStoreMessages =
                  getMissingStoreMessages(replicaMetadataResponseInfo, remoteNode, remoteReplicaInfo);
              processReplicaMetadataResponse(remoteMissingStoreMessages, replicaMetadataResponseInfo,
                  remoteReplicaInfo, remoteNode, remoteKeyToLocalKeyMap);
              // Get the converted keys for the missing keys of this replica (to store them along with missing keys
              // in the exchange metadata response). For leader-based replication, these are used during processing
              // of missing keys for non-leader replica pairs, which will come later via leader <-> leader replication.
              Map<StoreKey, StoreKey> remoteKeyToLocalKeySubMap = new HashMap<>();
              remoteMissingStoreMessages.forEach(remoteMissingStoreMessage -> {
                StoreKey remoteKey = remoteMissingStoreMessage.getStoreKey();
                remoteKeyToLocalKeySubMap.put(remoteKey, remoteKeyToLocalKeyMap.get(remoteKey));
              });
              ExchangeMetadataResponse exchangeMetadataResponse =
                  new ExchangeMetadataResponse(remoteMissingStoreMessages, replicaMetadataResponseInfo.getFindToken(),
                      replicaMetadataResponseInfo.getRemoteReplicaLagInBytes(), remoteKeyToLocalKeySubMap, time);
              // Update the replication lag in ReplicaSyncUpManager
              if (replicaSyncUpManager != null
                  && remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.BOOTSTRAP) {
                ReplicaId localReplica = remoteReplicaInfo.getLocalReplicaId();
                ReplicaId remoteReplica = remoteReplicaInfo.getReplicaId();
                boolean isSyncCompleted =
                    replicaSyncUpManager.updateReplicaLagAndCheckSyncStatus(localReplica, remoteReplica,
                        exchangeMetadataResponse.localLagFromRemoteInBytes, ReplicaState.STANDBY);
                // If catchup is completed by this update call, we can complete bootstrap in the local store
                if (isSyncCompleted) {
                  // Complete the BOOTSTRAP -> STANDBY transition
                  remoteReplicaInfo.getLocalStore().setCurrentState(ReplicaState.STANDBY);
                  remoteReplicaInfo.getLocalStore().completeBootstrap();
                }
              }
              // If the remote token has not moved forward, wait for the backoff time before resending the next
              // metadata request
              if (remoteReplicaInfo.getToken().equals(exchangeMetadataResponse.remoteToken)) {
                remoteReplicaInfo.setReEnableReplicationTime(
                    time.milliseconds() + replicationConfig.replicationSyncedReplicaBackoffDurationMs);
                syncedBackOffCount.inc();
              }
              // There are no missing keys; just advance the token
              if (exchangeMetadataResponse.missingStoreMessages.size() == 0) {
                remoteReplicaInfo.setToken(exchangeMetadataResponse.remoteToken);
                remoteReplicaInfo.setLocalLagFromRemoteInBytes(exchangeMetadataResponse.localLagFromRemoteInBytes);
                logger.trace("Remote node: {} Thread name: {} Remote replica: {} Token after speaking to remote node: {}",
                    remoteNode, threadName, remoteReplicaInfo.getReplicaId(), exchangeMetadataResponse.remoteToken);
              }
              replicationMetrics.updateLagMetricForRemoteReplica(remoteReplicaInfo,
                  exchangeMetadataResponse.localLagFromRemoteInBytes);
              if (replicaMetadataResponseInfo.getMessageInfoList().size() > 0) {
                replicationMetrics.updateCatchupPointMetricForCloudReplica(remoteReplicaInfo,
                    replicaMetadataResponseInfo.getMessageInfoList()
                        .get(replicaMetadataResponseInfo.getMessageInfoList().size() - 1)
                        .getOperationTimeMs());
              }
              // Add exchangeMetadataResponse to the list at the end, after operations such as the
              // replicaSyncUpManager update (if not null) have completed. The reason is that we may get exceptions
              // in between (e.g., replicaSyncUpManager may throw) and end up adding one more
              // exchangeMetadataResponse associated with the same RemoteReplicaInfo.
              exchangeMetadataResponseList.add(exchangeMetadataResponse);
            } catch (Exception e) {
              if (e instanceof StoreException
                  && ((StoreException) e).getErrorCode() == StoreErrorCodes.Store_Not_Started) {
                // The store must have just been stopped; skip it and move on.
                logger.info("Local store not started for remote replica: {}", remoteReplicaInfo.getReplicaId());
                exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Temporarily_Disabled));
              } else {
                logger.error("Remote node: {} Thread name: {} Remote replica: {}", remoteNode, threadName,
                    remoteReplicaInfo.getReplicaId(), e);
                replicationMetrics.updateLocalStoreError(remoteReplicaInfo.getReplicaId());
                responseHandler.onEvent(remoteReplicaInfo.getReplicaId(), e);
                exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Unknown_Error));
              }
            }
          }
        } else {
          replicationMetrics.updateMetadataRequestError(remoteReplicaInfo.getReplicaId());
          logger.error("Remote node: {} Thread name: {} Remote replica: {} Server error: {}", remoteNode, threadName,
              remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getError());
          exchangeMetadataResponseList.add(new ExchangeMetadataResponse(replicaMetadataResponseInfo.getError()));
        }
        if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
          ExchangeMetadataResponse exchangeMetadataResponse = exchangeMetadataResponseList.get(i);
          if (exchangeMetadataResponse.serverErrorCode.equals(ServerErrorCode.No_Error)) {
            // If leader-based replication is enabled, store the metadata exchange received for the remote replica:
            // standby replicas will not send GET requests for the missing store keys and instead track them via
            // leader <-> leader exchanges and intra-dc replication.
            remoteReplicaInfo.setExchangeMetadataResponse(new ExchangeMetadataResponse(exchangeMetadataResponse));
            // It is possible that some of the missing keys found in the exchange metadata response were written in
            // parallel by other replica threads since the time we calculated them. Go through the local store once
            // more and update the missing keys set stored in the exchangeMetadataResponse for the remote replica.
            refreshMissingStoreMessagesForStandbyReplica(remoteReplicaInfo);
          }
        }
      }
      long processMetadataResponseTimeInMs = time.milliseconds() - startTimeInMs;
      logger.trace("Remote node: {} Thread name: {} processMetadataResponseTime: {}", remoteNode, threadName,
          processMetadataResponseTimeInMs);
    } finally {
      long exchangeMetadataTime = time.milliseconds() - exchangeMetadataStartTimeInMs;
      replicationMetrics.updateExchangeMetadataTime(exchangeMetadataTime, replicatingFromRemoteColo,
          replicatingOverSsl, datacenterName);
    }
  }
  return exchangeMetadataResponseList;
}
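To make the return contract concrete, here is a minimal sketch of how a caller might consume the result. It relies only on members visible in the snippets on this page (the serverErrorCode field and getMissingStoreKeys()) and on the index alignment between the input and output lists:

// Hedged sketch: the response list is index-aligned with replicasToReplicatePerNode.
List<ExchangeMetadataResponse> responses = exchangeMetadata(connectedChannel, replicasToReplicatePerNode);
for (int i = 0; i < responses.size(); i++) {
  ExchangeMetadataResponse response = responses.get(i);
  RemoteReplicaInfo replicaInfo = replicasToReplicatePerNode.get(i);
  if (response.serverErrorCode.equals(ServerErrorCode.No_Error)) {
    // These keys exist on the remote replica but not locally; they drive the subsequent GET phase.
    logger.info("Replica {} is missing {} keys", replicaInfo.getReplicaId(), response.getMissingStoreKeys().size());
  }
}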
Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
The class ReplicaThread, method applyUndelete.
/**
* Applies an undelete to the blob described by {@code messageInfo}.
* @param messageInfo the {@link MessageInfo} that will be transformed into an undelete
* @param remoteReplicaInfo The remote replica that is being replicated from
* @throws StoreException
*/
private void applyUndelete(MessageInfo messageInfo, RemoteReplicaInfo remoteReplicaInfo) throws StoreException {
  DataNodeId remoteNode = remoteReplicaInfo.getReplicaId().getDataNodeId();
  try {
    messageInfo = new MessageInfo.Builder(messageInfo).isUndeleted(true).isDeleted(false).build();
    remoteReplicaInfo.getLocalStore().undelete(messageInfo);
    logger.trace("Remote node: {} Thread name: {} Remote replica: {} Key undelete id: {}", remoteNode, threadName,
        remoteReplicaInfo.getReplicaId(), messageInfo.getStoreKey());
  } catch (StoreException e) {
    // The blob may already be undeleted, which is alright
    if (e.getErrorCode() == StoreErrorCodes.Life_Version_Conflict || e.getErrorCode() == StoreErrorCodes.ID_Undeleted) {
      logger.trace("Remote node: {} Thread name: {} Remote replica: {} Key {}: {}", remoteNode, threadName,
          remoteReplicaInfo.getReplicaId(), messageInfo.getStoreKey(), e.getErrorCode().name());
    } else {
      throw e;
    }
  }
  // Fire the repair notification, as long as the undelete is guaranteed to have taken effect locally.
  if (notification != null) {
    notification.onBlobReplicaUndeleted(dataNodeId.getHostname(), dataNodeId.getPort(),
        messageInfo.getStoreKey().getID(), BlobReplicaSourceType.REPAIRED);
  }
}
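The error handling above is what makes undelete replication idempotent: the two swallowed error codes mean the local store already reflects the desired state (or holds a newer life version), so they are logged rather than propagated. Reduced to its core, using only names visible above:

// Hedged sketch of the idempotency pattern: only unexpected store errors propagate.
try {
  remoteReplicaInfo.getLocalStore().undelete(messageInfo);
} catch (StoreException e) {
  if (e.getErrorCode() != StoreErrorCodes.Life_Version_Conflict
      && e.getErrorCode() != StoreErrorCodes.ID_Undeleted) {
    throw e;  // a genuine failure; benign conflicts are treated as success
  }
}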
Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
The class ReplicaThread, method removeRemoteReplicaInfo.
/**
* Remove {@link RemoteReplicaInfo} from current {@link ReplicaThread}.
* @param remoteReplicaInfo {@link RemoteReplicaInfo} to remove.
*/
void removeRemoteReplicaInfo(RemoteReplicaInfo remoteReplicaInfo) {
  lock.lock();
  try {
    DataNodeId dataNodeId = remoteReplicaInfo.getReplicaId().getDataNodeId();
    Set<RemoteReplicaInfo> remoteReplicaInfos = replicasToReplicateGroupedByNode.get(dataNodeId);
    if (remoteReplicaInfos != null) {
      if (!remoteReplicaInfos.remove(remoteReplicaInfo)) {
        replicationMetrics.remoteReplicaInfoRemoveError.inc();
        logger.error("ReplicaThread: {}, RemoteReplicaInfo {} not found.", threadName, remoteReplicaInfo);
      }
    } else {
      replicationMetrics.remoteReplicaInfoRemoveError.inc();
      logger.error("ReplicaThread: {}, RemoteReplicaInfos Set is not created for DataNode {}, RemoteReplicaInfo: {}.",
          threadName, dataNodeId, remoteReplicaInfo);
    }
  } finally {
    lock.unlock();
  }
  logger.trace("RemoteReplicaInfo {} is removed from ReplicaThread {}.", remoteReplicaInfo, threadName);
}
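For symmetry, an add path would take the same lock and insert into the same per-node grouping. The sketch below is hypothetical (Ambry's actual add method may differ) and is shown only to illustrate the locking discipline that keeps replicasToReplicateGroupedByNode consistent:

// Hypothetical counterpart, illustrating the same lock/grouping discipline.
void addRemoteReplicaInfo(RemoteReplicaInfo remoteReplicaInfo) {
  lock.lock();
  try {
    DataNodeId dataNodeId = remoteReplicaInfo.getReplicaId().getDataNodeId();
    replicasToReplicateGroupedByNode.computeIfAbsent(dataNodeId, node -> new HashSet<>()).add(remoteReplicaInfo);
  } finally {
    lock.unlock();
  }
}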
Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
The class ReplicaThread, method replicate.
/**
 * Does replication for replicas grouped by {@link DataNodeId}.
 * A replication cycle between two replicas involves the following steps:
 * 1. Exchange metadata: fetch the metadata of blobs added to the remote replica since the last synchronization
 *    point and identify the ones missing in the local store.
 * 2. Fetch missing blobs: fetch the missing blobs by issuing a GET request to the remote replica and write them to
 *    the local store.
 *
 * During cross-colo replication, depending on the {@link ReplicationModelType}, the missing blobs are either fetched
 * from all remote replicas (if modelType == ALL_TO_ALL) or only fetched for local leader replicas from their remote
 * leader replicas (if modelType == LEADER_BASED). In the latter case, non-leader replica pairs (leader <-> standby,
 * standby <-> leader, standby <-> standby) will get their missing blobs from their corresponding leader <-> leader
 * exchanges and intra-dc replication.
 *
 * Here is a table listing what is exchanged between local and remote replicas based on their roles
 * (leader/standby) when {@link ReplicationModelType} is LEADER_BASED.
 *
 *          |    Local Leader    |   Local Standby    |   Remote Leader    |  Remote Standby
 * ------------------------------------------------------------------------------------------
 * Leader:  |        ---         | metadata and data  | metadata and data  |  metadata only
 * Standby: | metadata and data  | metadata and data  |   metadata only    |  metadata only
 */
public void replicate() {
  boolean allCaughtUp = true;
  Map<DataNodeId, List<RemoteReplicaInfo>> dataNodeToRemoteReplicaInfo = getRemoteReplicaInfos();
  logger.trace("Replicating from {} DataNodes.", replicasToReplicateGroupedByNode.size());
  for (Map.Entry<DataNodeId, List<RemoteReplicaInfo>> entry : dataNodeToRemoteReplicaInfo.entrySet()) {
    DataNodeId remoteNode = entry.getKey();
    if (!running) {
      break;
    }
    List<RemoteReplicaInfo> replicasToReplicatePerNode = entry.getValue();
    Timer.Context context = null;
    Timer.Context portTypeBasedContext = null;
    if (replicatingFromRemoteColo) {
      context = replicationMetrics.interColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
      if (replicatingOverSsl) {
        portTypeBasedContext =
            replicationMetrics.sslInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
      } else {
        portTypeBasedContext =
            replicationMetrics.plainTextInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
      }
    } else {
      context = replicationMetrics.intraColoReplicationLatency.time();
      if (replicatingOverSsl) {
        portTypeBasedContext = replicationMetrics.sslIntraColoReplicationLatency.time();
      } else {
        portTypeBasedContext = replicationMetrics.plainTextIntraColoReplicationLatency.time();
      }
    }
    ConnectedChannel connectedChannel = null;
    long checkoutConnectionTimeInMs = -1;
    long exchangeMetadataTimeInMs = -1;
    long fixMissingStoreKeysTimeInMs = -1;
    long replicationStartTimeInMs = time.milliseconds();
    long startTimeInMs = replicationStartTimeInMs;
    // Get a list of active replicas that need to be included in this replication cycle
    List<RemoteReplicaInfo> activeReplicasPerNode = new ArrayList<>();
    List<RemoteReplicaInfo> standbyReplicasWithNoProgress = new ArrayList<>();
    for (RemoteReplicaInfo remoteReplicaInfo : replicasToReplicatePerNode) {
      ReplicaId replicaId = remoteReplicaInfo.getReplicaId();
      boolean inBackoff = time.milliseconds() < remoteReplicaInfo.getReEnableReplicationTime();
      if (replicaId.isDown() || inBackoff
          || remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.OFFLINE
          || replicationDisabledPartitions.contains(replicaId.getPartitionId())) {
        logger.debug("Skipping replication on replica {} because one of the following conditions is true: "
                + "remote replica is down = {}; in backoff = {}; local store is offline = {}; "
                + "replication is disabled = {}.", replicaId.getPartitionId().toPathString(), replicaId.isDown(),
            inBackoff, remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.OFFLINE,
            replicationDisabledPartitions.contains(replicaId.getPartitionId()));
        continue;
      }
      if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
        // Check whether all missing keys for standby replicas from the previous replication cycle have now been
        // obtained via the leader replica. If we still have missing keys, don't include them in the current
        // replication cycle, to avoid sending duplicate metadata requests since their token wouldn't have advanced.
        processMissingKeysFromPreviousMetadataResponse(remoteReplicaInfo);
        if (containsMissingKeysFromPreviousMetadataExchange(remoteReplicaInfo)) {
          standbyReplicasWithNoProgress.add(remoteReplicaInfo);
          continue;
        }
      }
      activeReplicasPerNode.add(remoteReplicaInfo);
    }
    logger.trace("Replicating from {} RemoteReplicaInfos.", activeReplicasPerNode.size());
    // Use a variable to track the current replica list to replicate (for logging purposes)
    List<RemoteReplicaInfo> currentReplicaList = activeReplicasPerNode;
    try {
      if (activeReplicasPerNode.size() > 0) {
        allCaughtUp = false;
        // If maxReplicaCountPerRequest > 0, split remote replicas on the same node into multiple lists; otherwise
        // there is no limit.
        List<List<RemoteReplicaInfo>> activeReplicaSubLists = maxReplicaCountPerRequest > 0
            ? Utils.partitionList(activeReplicasPerNode, maxReplicaCountPerRequest)
            : Collections.singletonList(activeReplicasPerNode);
        startTimeInMs = time.milliseconds();
        connectedChannel = connectionPool.checkOutConnection(remoteNode.getHostname(),
            activeReplicasPerNode.get(0).getPort(), replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
        checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
        // We check out the ConnectedChannel once and replicate remote replicas in batches via the same channel
        for (List<RemoteReplicaInfo> replicaSubList : activeReplicaSubLists) {
          exchangeMetadataTimeInMs = -1;
          fixMissingStoreKeysTimeInMs = -1;
          currentReplicaList = replicaSubList;
          logger.debug("Exchanging metadata with {} remote replicas on {}", currentReplicaList.size(), remoteNode);
          startTimeInMs = time.milliseconds();
          List<ExchangeMetadataResponse> exchangeMetadataResponseList =
              exchangeMetadata(connectedChannel, replicaSubList);
          exchangeMetadataTimeInMs = time.milliseconds() - startTimeInMs;
          if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
            // If leader-based replication is enabled and we are replicating from a remote colo, fetch the missing
            // blobs only for local leader replicas from their corresponding peer leader replicas (leader <-> leader).
            // Non-leader replica pairs (standby <-> leader, leader <-> standby, standby <-> standby) will get their
            // missing blobs from their leader pair exchanges and intra-dc replication.
            List<RemoteReplicaInfo> leaderReplicaList = new ArrayList<>();
            List<ExchangeMetadataResponse> exchangeMetadataResponseListForLeaderReplicas = new ArrayList<>();
            getLeaderReplicaList(replicaSubList, exchangeMetadataResponseList, leaderReplicaList,
                exchangeMetadataResponseListForLeaderReplicas);
            replicaSubList = leaderReplicaList;
            exchangeMetadataResponseList = exchangeMetadataResponseListForLeaderReplicas;
          }
          if (replicaSubList.size() > 0) {
            startTimeInMs = time.milliseconds();
            fixMissingStoreKeys(connectedChannel, replicaSubList, exchangeMetadataResponseList, false);
            fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
          }
        }
      }
      if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
        // Get a list of blocked standby replicas whose missing keys haven't arrived for a long time.
        // Use case: in leader-based cross-colo replication, standby replicas don't send GET requests for missing
        // keys found in metadata exchanges and expect them to come via leader <-> leader replication.
        // This is a safety condition to ensure that standby replicas are not stuck waiting for keys to come from
        // the leader, by fetching the missing keys themselves.
        // TODO: As an improvement, we could first fetch missing blobs from the local leader/other replicas intra-dc.
        // TODO: If the result of fetching a blob from the local dc is Blob_Not_Found, we could then fetch it from
        // replicas in the remote datacenter. This would involve coordination between replica threads containing
        // replicas of the same partition.
        List<RemoteReplicaInfo> standbyReplicasTimedOutOnNoProgress =
            getRemoteStandbyReplicasTimedOutOnNoProgress(standbyReplicasWithNoProgress);
        if (standbyReplicasTimedOutOnNoProgress.size() > 0) {
          allCaughtUp = false;
          currentReplicaList = standbyReplicasTimedOutOnNoProgress;
          if (connectedChannel == null) {
            checkoutConnectionTimeInMs = -1;
            startTimeInMs = time.milliseconds();
            connectedChannel = connectionPool.checkOutConnection(remoteNode.getHostname(),
                standbyReplicasTimedOutOnNoProgress.get(0).getPort(),
                replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
            checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
          }
          List<ExchangeMetadataResponse> exchangeMetadataResponseListForBlockedReplicas =
              standbyReplicasTimedOutOnNoProgress.stream()
                  .map(remoteReplicaInfo -> new ExchangeMetadataResponse(
                      remoteReplicaInfo.getExchangeMetadataResponse()))
                  .collect(Collectors.toList());
          // Convert (and cache) the remote keys that are being fetched, as the StoreKeyConverter would have cleared
          // these keys from its cache while replicating with other replicas before the timeout happened for these
          // standby replicas.
          List<StoreKey> storeKeysToConvert = exchangeMetadataResponseListForBlockedReplicas.stream()
              .map(ExchangeMetadataResponse::getMissingStoreKeys)
              .flatMap(Collection::stream)
              .collect(Collectors.toList());
          convertStoreKeys(storeKeysToConvert);
          exchangeMetadataTimeInMs = 0;
          fixMissingStoreKeysTimeInMs = -1;
          logger.debug("Sending GET request to fetch missing keys for standby remote replicas {} timed out on no progress",
              currentReplicaList);
          startTimeInMs = time.milliseconds();
          fixMissingStoreKeys(connectedChannel, standbyReplicasTimedOutOnNoProgress,
              exchangeMetadataResponseListForBlockedReplicas, true);
          fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
        }
      }
    } catch (Throwable e) {
      if (checkoutConnectionTimeInMs == -1) {
        // The throwable happened in the checkout connection phase
        checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
        responseHandler.onEvent(currentReplicaList.get(0).getReplicaId(), e);
      } else if (exchangeMetadataTimeInMs == -1) {
        // The throwable happened in the exchange metadata phase
        exchangeMetadataTimeInMs = time.milliseconds() - startTimeInMs;
      } else if (fixMissingStoreKeysTimeInMs == -1) {
        // The throwable happened in the fix missing store keys phase
        fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
      }
      logger.error("Error while talking to peer: Remote node: {}, Thread name: {}, Remote replicas: {}, Current active "
              + "remote replica list: {}, Checkout connection time: {}, Exchange metadata time: {}, Fix missing "
              + "store key time {}", remoteNode, threadName, replicasToReplicatePerNode, currentReplicaList,
          checkoutConnectionTimeInMs, exchangeMetadataTimeInMs, fixMissingStoreKeysTimeInMs, e);
      replicationMetrics.incrementReplicationErrors(replicatingOverSsl);
      if (connectedChannel != null) {
        connectionPool.destroyConnection(connectedChannel);
        connectedChannel = null;
      }
    } finally {
      long totalReplicationTime = time.milliseconds() - replicationStartTimeInMs;
      replicationMetrics.updateTotalReplicationTime(totalReplicationTime, replicatingFromRemoteColo,
          replicatingOverSsl, datacenterName);
      if (connectedChannel != null) {
        connectionPool.checkInConnection(connectedChannel);
      }
      context.stop();
      portTypeBasedContext.stop();
    }
  }
  long sleepDurationMs = 0;
  if (allCaughtUp && replicationConfig.replicationReplicaThreadIdleSleepDurationMs > 0) {
    sleepDurationMs = replicationConfig.replicationReplicaThreadIdleSleepDurationMs;
    idleCount.inc();
  } else if (threadThrottleDurationMs > 0) {
    sleepDurationMs = threadThrottleDurationMs;
    throttleCount.inc();
  }
  if (sleepDurationMs > 0) {
    try {
      long currentTime = time.milliseconds();
      time.sleep(sleepDurationMs);
      logger.trace("Replica thread: {} slept for {} ms", threadName, time.milliseconds() - currentTime);
    } catch (InterruptedException e) {
      logger.error("Received interrupted exception during throttling", e);
    }
  }
}
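One step that is easy to miss in the method above is the request batching: when maxReplicaCountPerRequest > 0, Utils.partitionList splits the per-node replica list so that a single metadata request never covers more than that many replicas, and every batch reuses the ConnectedChannel checked out once per node. Below is a small, self-contained sketch of that batching behavior; it is a stand-in for Utils.partitionList, whose exact signature is assumed, not taken from the snippet:

// Hedged sketch: split items into sublists of at most batchSize elements.
static <T> List<List<T>> partitionList(List<T> items, int batchSize) {
  List<List<T>> batches = new ArrayList<>();
  for (int start = 0; start < items.size(); start += batchSize) {
    batches.add(items.subList(start, Math.min(start + batchSize, items.size())));
  }
  return batches;
}
// e.g., 7 replicas with maxReplicaCountPerRequest = 3 yield batches of sizes 3, 3, and 1,
// all replicated over the same checked-out connection.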