Example 11 with DataNodeId

Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

The class Http2NetworkClient, method warmUpConnections.

@Override
public int warmUpConnections(List<DataNodeId> dataNodeIds, int connectionWarmUpPercentagePerDataNode, long timeForWarmUp, List<ResponseInfo> responseInfoList) {
    long startTime = System.currentTimeMillis();
    AtomicInteger successCount = new AtomicInteger();
    AtomicInteger failCount = new AtomicInteger();
    int warmUpConnectionPerPort = http2ClientConfig.http2MinConnectionPerPort * connectionWarmUpPercentagePerDataNode / 100;
    int expectedConnections = dataNodeIds.size() * warmUpConnectionPerPort;
    for (DataNodeId dataNodeId : dataNodeIds) {
        for (int i = 0; i < warmUpConnectionPerPort; i++) {
            this.pools.get(InetSocketAddress.createUnresolved(dataNodeId.getHostname(), dataNodeId.getHttp2Port())).acquire().addListener((GenericFutureListener<Future<Channel>>) future -> {
                if (future.isSuccess()) {
                    Channel streamChannel = future.getNow();
                    releaseAndCloseStreamChannel(streamChannel);
                    successCount.incrementAndGet();
                } else {
                    failCount.incrementAndGet();
                    responseInfoList.add(new ResponseInfo(null, NetworkClientErrorCode.NetworkError, null, dataNodeId));
                    logger.error("Couldn't acquire stream channel to {}:{} . Cause: {}.", dataNodeId.getHostname(), dataNodeId.getHttp2Port(), future.cause());
                }
            });
        }
    }
    while (System.currentTimeMillis() - startTime < timeForWarmUp) {
        if (successCount.get() + failCount.get() == expectedConnections) {
            break;
        } else {
            try {
                Thread.sleep(300);
            } catch (InterruptedException e) {
                break;
            }
        }
    }
    logger.info("HTTP2 connection warm up done. Tried: {}, Succeeded: {}, Failed: {}, Time elapsed: {} ms", expectedConnections, successCount, failCount, System.currentTimeMillis() - startTime);
    return successCount.get();
}
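
For context, here is a caller-side sketch of driving this warm-up. It is a minimal illustration, not ambry code: only the warmUpConnections(...) signature is taken from the example above (assuming, per the @Override, that it is declared on the NetworkClient interface), while the helper class, the 50 percent warm-up ratio and the 5 second timeout are assumptions.

import com.github.ambry.clustermap.DataNodeId;
import com.github.ambry.network.NetworkClient;
import com.github.ambry.network.ResponseInfo;
import java.util.ArrayList;
import java.util.List;

// Hypothetical helper, not part of ambry. It asks for 50% of the configured HTTP/2
// connections per data node, waits at most 5 seconds, and reports failures surfaced
// through the responseInfoList parameter of warmUpConnections(...).
final class Http2WarmUpSketch {
    static int warmUp(NetworkClient client, List<DataNodeId> dataNodeIds) {
        List<ResponseInfo> failures = new ArrayList<>();
        int warmedUp = client.warmUpConnections(dataNodeIds, 50, 5000, failures);
        // Each failed stream acquisition shows up as a ResponseInfo with NetworkClientErrorCode.NetworkError.
        System.out.println("Warmed up " + warmedUp + " connections, " + failures.size() + " failures reported.");
        return warmedUp;
    }
}
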
Also used : ResponseInfo(com.github.ambry.network.ResponseInfo) AttributeKey(io.netty.util.AttributeKey) Http2ClientConfig(com.github.ambry.config.Http2ClientConfig) DataNodeId(com.github.ambry.clustermap.DataNodeId) LoggerFactory(org.slf4j.LoggerFactory) ArrayList(java.util.ArrayList) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) NetworkClientErrorCode(com.github.ambry.network.NetworkClientErrorCode) Http2StreamFrameToHttpObjectCodec(io.netty.handler.codec.http2.Http2StreamFrameToHttpObjectCodec) ChannelFutureListener(io.netty.channel.ChannelFutureListener) Map(java.util.Map) Http2Utils(com.github.ambry.network.http2.Http2Utils) EventLoopGroup(io.netty.channel.EventLoopGroup) Logger(org.slf4j.Logger) SSLFactory(com.github.ambry.commons.SSLFactory) ChannelInitializer(io.netty.channel.ChannelInitializer) NetworkClient(com.github.ambry.network.NetworkClient) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) GenericFutureListener(io.netty.util.concurrent.GenericFutureListener) Set(java.util.Set) InetSocketAddress(java.net.InetSocketAddress) RequestInfo(com.github.ambry.network.RequestInfo) ChannelFuture(io.netty.channel.ChannelFuture) Channel(io.netty.channel.Channel) List(java.util.List) ChannelPool(io.netty.channel.pool.ChannelPool) ChannelPoolMap(io.netty.channel.pool.ChannelPoolMap) Future(io.netty.util.concurrent.Future) HttpObjectAggregator(io.netty.handler.codec.http.HttpObjectAggregator)

Example 12 with DataNodeId

Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

The class ReplicaThread, method exchangeMetadata.

/**
 * Gets all the metadata about messages from the remote replicas since last token. Checks the messages with the local
 * store and finds all the messages that are missing. For the messages that are not missing, updates the delete
 * and ttl state.
 * @param connectedChannel The connected channel that represents a connection to the remote replica
 * @param replicasToReplicatePerNode The information about the replicas that are being replicated
 * @return - List of ExchangeMetadataResponse that contains the set of store keys that are missing from the local
 *           store and are present in the remote replicas and also the new token from the remote replicas
 * @throws IOException
 * @throws ReplicationException
 */
List<ExchangeMetadataResponse> exchangeMetadata(ConnectedChannel connectedChannel, List<RemoteReplicaInfo> replicasToReplicatePerNode) throws IOException, ReplicationException {
    long exchangeMetadataStartTimeInMs = time.milliseconds();
    List<ExchangeMetadataResponse> exchangeMetadataResponseList = new ArrayList<>();
    if (replicasToReplicatePerNode.size() > 0) {
        try {
            DataNodeId remoteNode = replicasToReplicatePerNode.get(0).getReplicaId().getDataNodeId();
            ReplicaMetadataResponse response = getReplicaMetadataResponse(replicasToReplicatePerNode, connectedChannel, remoteNode);
            long startTimeInMs = time.milliseconds();
            Map<StoreKey, StoreKey> remoteKeyToLocalKeyMap = batchConvertReplicaMetadataResponseKeys(response);
            for (int i = 0; i < response.getReplicaMetadataResponseInfoList().size(); i++) {
                RemoteReplicaInfo remoteReplicaInfo = replicasToReplicatePerNode.get(i);
                ReplicaMetadataResponseInfo replicaMetadataResponseInfo = response.getReplicaMetadataResponseInfoList().get(i);
                responseHandler.onEvent(remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getError());
                if (replicaMetadataResponseInfo.getError() == ServerErrorCode.No_Error) {
                    // Skip stores that were stopped during call to getReplicaMetadataResponse
                    if (!remoteReplicaInfo.getLocalStore().isStarted()) {
                        exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Temporarily_Disabled));
                    } else {
                        try {
                            logger.trace("Remote node: {} Thread name: {} Remote replica: {} Token from remote: {} Replica lag: {} ", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getFindToken(), replicaMetadataResponseInfo.getRemoteReplicaLagInBytes());
                            Set<MessageInfo> remoteMissingStoreMessages = getMissingStoreMessages(replicaMetadataResponseInfo, remoteNode, remoteReplicaInfo);
                            processReplicaMetadataResponse(remoteMissingStoreMessages, replicaMetadataResponseInfo, remoteReplicaInfo, remoteNode, remoteKeyToLocalKeyMap);
                            // Get the converted keys for the missing keys of this replica (to store them along with missing keys in
                            // the exchange metadata response). For leader based replication, these are used during processing
                            // of missing keys for non-leader replica pairs which will come later via leader<->leader replication.
                            Map<StoreKey, StoreKey> remoteKeyToLocalKeySubMap = new HashMap<>();
                            remoteMissingStoreMessages.forEach(remoteMissingStoreMessage -> {
                                StoreKey remoteKey = remoteMissingStoreMessage.getStoreKey();
                                remoteKeyToLocalKeySubMap.put(remoteKey, remoteKeyToLocalKeyMap.get(remoteKey));
                            });
                            ExchangeMetadataResponse exchangeMetadataResponse = new ExchangeMetadataResponse(remoteMissingStoreMessages, replicaMetadataResponseInfo.getFindToken(), replicaMetadataResponseInfo.getRemoteReplicaLagInBytes(), remoteKeyToLocalKeySubMap, time);
                            // update replication lag in ReplicaSyncUpManager
                            if (replicaSyncUpManager != null && remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.BOOTSTRAP) {
                                ReplicaId localReplica = remoteReplicaInfo.getLocalReplicaId();
                                ReplicaId remoteReplica = remoteReplicaInfo.getReplicaId();
                                boolean isSyncCompleted = replicaSyncUpManager.updateReplicaLagAndCheckSyncStatus(localReplica, remoteReplica, exchangeMetadataResponse.localLagFromRemoteInBytes, ReplicaState.STANDBY);
                                // if catchup is completed by this update call, we can complete bootstrap in local store
                                if (isSyncCompleted) {
                                    // complete BOOTSTRAP -> STANDBY transition
                                    remoteReplicaInfo.getLocalStore().setCurrentState(ReplicaState.STANDBY);
                                    remoteReplicaInfo.getLocalStore().completeBootstrap();
                                }
                            }
                            // If remote token has not moved forward, wait for back off time before resending next metadata request
                            if (remoteReplicaInfo.getToken().equals(exchangeMetadataResponse.remoteToken)) {
                                remoteReplicaInfo.setReEnableReplicationTime(time.milliseconds() + replicationConfig.replicationSyncedReplicaBackoffDurationMs);
                                syncedBackOffCount.inc();
                            }
                            // There are no missing keys. We just advance the token
                            if (exchangeMetadataResponse.missingStoreMessages.size() == 0) {
                                remoteReplicaInfo.setToken(exchangeMetadataResponse.remoteToken);
                                remoteReplicaInfo.setLocalLagFromRemoteInBytes(exchangeMetadataResponse.localLagFromRemoteInBytes);
                                logger.trace("Remote node: {} Thread name: {} Remote replica: {} Token after speaking to remote node: {}", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), exchangeMetadataResponse.remoteToken);
                            }
                            replicationMetrics.updateLagMetricForRemoteReplica(remoteReplicaInfo, exchangeMetadataResponse.localLagFromRemoteInBytes);
                            if (replicaMetadataResponseInfo.getMessageInfoList().size() > 0) {
                                replicationMetrics.updateCatchupPointMetricForCloudReplica(remoteReplicaInfo, replicaMetadataResponseInfo.getMessageInfoList().get(replicaMetadataResponseInfo.getMessageInfoList().size() - 1).getOperationTimeMs());
                            }
                            // Add exchangeMetadataResponse to list at the end after operations such as replicaSyncUpManager(if not null)
                            // has completed update, etc. The reason is we may get exceptions in between (for ex: replicaSyncUpManager may
                            // throw exception) and end up adding one more exchangeMetadataResponse associated with same RemoteReplicaInfo.
                            exchangeMetadataResponseList.add(exchangeMetadataResponse);
                        } catch (Exception e) {
                            if (e instanceof StoreException && ((StoreException) e).getErrorCode() == StoreErrorCodes.Store_Not_Started) {
                                // Must have just been stopped, just skip it and move on.
                                logger.info("Local store not started for remote replica: {}", remoteReplicaInfo.getReplicaId());
                                exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Temporarily_Disabled));
                            } else {
                                logger.error("Remote node: {} Thread name: {} Remote replica: {}", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), e);
                                replicationMetrics.updateLocalStoreError(remoteReplicaInfo.getReplicaId());
                                responseHandler.onEvent(remoteReplicaInfo.getReplicaId(), e);
                                exchangeMetadataResponseList.add(new ExchangeMetadataResponse(ServerErrorCode.Unknown_Error));
                            }
                        }
                    }
                } else {
                    replicationMetrics.updateMetadataRequestError(remoteReplicaInfo.getReplicaId());
                    logger.error("Remote node: {} Thread name: {} Remote replica: {} Server error: {}", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), replicaMetadataResponseInfo.getError());
                    exchangeMetadataResponseList.add(new ExchangeMetadataResponse(replicaMetadataResponseInfo.getError()));
                }
                if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
                    ExchangeMetadataResponse exchangeMetadataResponse = exchangeMetadataResponseList.get(i);
                    if (exchangeMetadataResponse.serverErrorCode.equals(ServerErrorCode.No_Error)) {
                        // If leader-based replication is enabled, store the metadata exchange received for the remote replica as
                        // standby replicas will not send GET request for the missing store keys and track them from leader <->
                        // leader exchanges and intra-dc replication.
                        remoteReplicaInfo.setExchangeMetadataResponse(new ExchangeMetadataResponse(exchangeMetadataResponse));
                        // It is possible that some of the missing keys found in exchange metadata response are written in parallel
                        // by other replica threads since the time we calculated it. Go through the local store once more and
                        // update missing keys set stored in the exchangeMetadataResponse for the remote replica.
                        refreshMissingStoreMessagesForStandbyReplica(remoteReplicaInfo);
                    }
                }
            }
            long processMetadataResponseTimeInMs = time.milliseconds() - startTimeInMs;
            logger.trace("Remote node: {} Thread name: {} processMetadataResponseTime: {}", remoteNode, threadName, processMetadataResponseTimeInMs);
        } finally {
            long exchangeMetadataTime = time.milliseconds() - exchangeMetadataStartTimeInMs;
            replicationMetrics.updateExchangeMetadataTime(exchangeMetadataTime, replicatingFromRemoteColo, replicatingOverSsl, datacenterName);
        }
    }
    return exchangeMetadataResponseList;
}
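
To illustrate how the returned list lines up with the input, a hedged consumer sketch follows: one ExchangeMetadataResponse per RemoteReplicaInfo, in the same order. The field names (serverErrorCode, missingStoreMessages, remoteToken) come from the code above; the helper method itself is an assumption and is written as if it were another method on ReplicaThread, since ExchangeMetadataResponse is not public.

// Hypothetical helper, sketched as if it sat inside ReplicaThread so that the package-private
// exchangeMetadata(...) and the ExchangeMetadataResponse fields used below are visible.
void logExchangeResults(ConnectedChannel channel, List<RemoteReplicaInfo> replicas) throws IOException, ReplicationException {
    List<ExchangeMetadataResponse> responses = exchangeMetadata(channel, replicas);
    // The list is positional: responses.get(i) describes replicas.get(i).
    for (int i = 0; i < responses.size(); i++) {
        ExchangeMetadataResponse response = responses.get(i);
        if (!response.serverErrorCode.equals(ServerErrorCode.No_Error)) {
            // The metadata request failed for this replica; the error was already recorded above.
            continue;
        }
        // An empty missing set means only the token advanced; otherwise these messages still need a GET.
        logger.info("Replica {} is missing {} messages; remote token: {}", replicas.get(i).getReplicaId(),
            response.missingStoreMessages.size(), response.remoteToken);
    }
}
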
Also used : ReplicaMetadataResponse(com.github.ambry.protocol.ReplicaMetadataResponse) ReplicaMetadataResponseInfo(com.github.ambry.protocol.ReplicaMetadataResponseInfo) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StoreKey(com.github.ambry.store.StoreKey) ReplicaId(com.github.ambry.clustermap.ReplicaId) StoreException(com.github.ambry.store.StoreException) IOException(java.io.IOException) MessageInfo(com.github.ambry.store.MessageInfo) DataNodeId(com.github.ambry.clustermap.DataNodeId)

Example 13 with DataNodeId

Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

The class ReplicaThread, method applyUndelete.

/**
 * Applies an undelete to the blob described by {@code messageInfo}.
 * @param messageInfo the {@link MessageInfo} that will be transformed into an undelete
 * @param remoteReplicaInfo The remote replica that is being replicated from
 * @throws StoreException
 */
private void applyUndelete(MessageInfo messageInfo, RemoteReplicaInfo remoteReplicaInfo) throws StoreException {
    DataNodeId remoteNode = remoteReplicaInfo.getReplicaId().getDataNodeId();
    try {
        messageInfo = new MessageInfo.Builder(messageInfo).isUndeleted(true).isDeleted(false).build();
        remoteReplicaInfo.getLocalStore().undelete(messageInfo);
        logger.trace("Remote node: {} Thread name: {} Remote replica: {} Key undelete id: {}", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), messageInfo.getStoreKey());
    } catch (StoreException e) {
        // The blob may already be undeleted, which is alright
        if (e.getErrorCode() == StoreErrorCodes.Life_Version_Conflict || e.getErrorCode() == StoreErrorCodes.ID_Undeleted) {
            logger.trace("Remote node: {} Thread name: {} Remote replica: {} Key {}: {}", remoteNode, threadName, remoteReplicaInfo.getReplicaId(), messageInfo.getStoreKey(), e.getErrorCode().name());
        } else {
            throw e;
        }
    }
    // Notify the notification system even if the store reported a benign conflict above, as long as the undelete is guaranteed to have taken effect locally.
    if (notification != null) {
        notification.onBlobReplicaUndeleted(dataNodeId.getHostname(), dataNodeId.getPort(), messageInfo.getStoreKey().getID(), BlobReplicaSourceType.REPAIRED);
    }
}
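
The core of the method is the flag flip via MessageInfo.Builder followed by a tolerant undelete call on the local store. A stripped-down sketch of just that pattern is shown below; the Store parameter and the helper name are illustrative assumptions, while the builder calls, undelete(...) and the tolerated error codes are taken from the example.

// Minimal sketch of the pattern above (hypothetical helper, not ambry code): mark the remote
// message as undeleted/not-deleted and apply it to the local store, tolerating the case where
// the store already reflects the undelete.
static void undeleteLocally(Store localStore, MessageInfo remoteInfo) throws StoreException {
    MessageInfo undelete = new MessageInfo.Builder(remoteInfo).isUndeleted(true).isDeleted(false).build();
    try {
        localStore.undelete(undelete);
    } catch (StoreException e) {
        // Already undeleted locally, or a newer life version exists: safe to ignore, as in the example.
        if (e.getErrorCode() != StoreErrorCodes.Life_Version_Conflict && e.getErrorCode() != StoreErrorCodes.ID_Undeleted) {
            throw e;
        }
    }
}
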
Also used : DataNodeId(com.github.ambry.clustermap.DataNodeId) StoreException(com.github.ambry.store.StoreException)

Example 14 with DataNodeId

Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

The class ReplicaThread, method removeRemoteReplicaInfo.

/**
 * Remove {@link RemoteReplicaInfo} from current {@link ReplicaThread}.
 * @param remoteReplicaInfo {@link RemoteReplicaInfo} to remove.
 */
void removeRemoteReplicaInfo(RemoteReplicaInfo remoteReplicaInfo) {
    lock.lock();
    try {
        DataNodeId dataNodeId = remoteReplicaInfo.getReplicaId().getDataNodeId();
        Set<RemoteReplicaInfo> remoteReplicaInfos = replicasToReplicateGroupedByNode.get(dataNodeId);
        if (remoteReplicaInfos != null) {
            if (!remoteReplicaInfos.remove(remoteReplicaInfo)) {
                replicationMetrics.remoteReplicaInfoRemoveError.inc();
                logger.error("ReplicaThread: {}, RemoteReplicaInfo {} not found.", threadName, remoteReplicaInfo);
            }
        } else {
            replicationMetrics.remoteReplicaInfoRemoveError.inc();
            logger.error("ReplicaThread: {}, RemoteReplicaInfos Set is not created for DataNode {}, RemoteReplicaInfo: {}.", threadName, dataNodeId, remoteReplicaInfo);
        }
    } finally {
        lock.unlock();
    }
    logger.trace("RemoteReplicaInfo {} is removed from ReplicaThread {}.", remoteReplicaInfo, threadName);
}
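
For symmetry, a hypothetical companion add operation is sketched below under the same lock discipline. The field names (lock, replicasToReplicateGroupedByNode, threadName, logger) come from the example above; the method body itself is an illustration, not necessarily ambry's actual add logic.

// Hypothetical companion to removeRemoteReplicaInfo, shown only to illustrate the locking pattern.
void addRemoteReplicaInfoSketch(RemoteReplicaInfo remoteReplicaInfo) {
    lock.lock();
    try {
        DataNodeId dataNodeId = remoteReplicaInfo.getReplicaId().getDataNodeId();
        // Create the per-node set lazily, then register the replica with this thread.
        replicasToReplicateGroupedByNode.computeIfAbsent(dataNodeId, node -> new HashSet<>()).add(remoteReplicaInfo);
    } finally {
        lock.unlock();
    }
    logger.trace("RemoteReplicaInfo {} is added to ReplicaThread {}.", remoteReplicaInfo, threadName);
}
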
Also used : DataNodeId(com.github.ambry.clustermap.DataNodeId)

Example 15 with DataNodeId

Use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

The class ReplicaThread, method replicate.

/**
 * Do replication for replicas grouped by {@link DataNodeId}
 * A replication cycle between two replicas involves the following steps:
 *    1. Exchange metadata : fetch the metadata of blobs added to remote replica since the last synchronization point
 *    and filter the ones missing in local store.
 *    2. Fetch missing blobs: fetch the missing blobs by issuing GET request to remote replica and write them to
 *       the local store
 *
 *  During cross-colo replication, depending on the {@link ReplicationModelType}, the missing blobs are either fetched
 *  from all remote replicas (if modelType == ALL_TO_ALL) or only fetched for local leader replicas from their remote
 *  leader replicas (if modelType == LEADER_BASED). In the latter case, non-leader replica pairs (leader <-> standby,
 *  standby <-> leader, standby <-> standby) will get their missing blobs from their corresponding leader<->leader
 *  exchanges and intra-dc replication.
 *
 *  Here is a table listing what is exchanged between local and remote replicas based on their roles
 *  (leader/standby) when {@link ReplicationModelType} is LEADER_BASED.
 *
 *              |   Local Leader    |     Local Standby   |   Remote Leader   |  Remote Standby
 *            -------------------------------------------------------------------------------------
 *     Leader:  |        ---        |  metadata and data  | metadata and data |   metadata only
 *     Standby: | metadata and data |  metadata and data  | metadata only     |   metadata only
 */
public void replicate() {
    boolean allCaughtUp = true;
    Map<DataNodeId, List<RemoteReplicaInfo>> dataNodeToRemoteReplicaInfo = getRemoteReplicaInfos();
    logger.trace("Replicating from {} DataNodes.", replicasToReplicateGroupedByNode.size());
    for (Map.Entry<DataNodeId, List<RemoteReplicaInfo>> entry : dataNodeToRemoteReplicaInfo.entrySet()) {
        DataNodeId remoteNode = entry.getKey();
        if (!running) {
            break;
        }
        List<RemoteReplicaInfo> replicasToReplicatePerNode = entry.getValue();
        Timer.Context context = null;
        Timer.Context portTypeBasedContext = null;
        if (replicatingFromRemoteColo) {
            context = replicationMetrics.interColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            if (replicatingOverSsl) {
                portTypeBasedContext = replicationMetrics.sslInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            } else {
                portTypeBasedContext = replicationMetrics.plainTextInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            }
        } else {
            context = replicationMetrics.intraColoReplicationLatency.time();
            if (replicatingOverSsl) {
                portTypeBasedContext = replicationMetrics.sslIntraColoReplicationLatency.time();
            } else {
                portTypeBasedContext = replicationMetrics.plainTextIntraColoReplicationLatency.time();
            }
        }
        ConnectedChannel connectedChannel = null;
        long checkoutConnectionTimeInMs = -1;
        long exchangeMetadataTimeInMs = -1;
        long fixMissingStoreKeysTimeInMs = -1;
        long replicationStartTimeInMs = time.milliseconds();
        long startTimeInMs = replicationStartTimeInMs;
        // Get a list of active replicas that need to be included for this replication cycle
        List<RemoteReplicaInfo> activeReplicasPerNode = new ArrayList<>();
        List<RemoteReplicaInfo> standbyReplicasWithNoProgress = new ArrayList<>();
        for (RemoteReplicaInfo remoteReplicaInfo : replicasToReplicatePerNode) {
            ReplicaId replicaId = remoteReplicaInfo.getReplicaId();
            boolean inBackoff = time.milliseconds() < remoteReplicaInfo.getReEnableReplicationTime();
            if (replicaId.isDown() || inBackoff || remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.OFFLINE || replicationDisabledPartitions.contains(replicaId.getPartitionId())) {
                logger.debug("Skipping replication on replica {} because one of following conditions is true: remote replica is down " + "= {}; in backoff = {}; local store is offline = {}; replication is disabled = {}.", replicaId.getPartitionId().toPathString(), replicaId.isDown(), inBackoff, remoteReplicaInfo.getLocalStore().getCurrentState() == ReplicaState.OFFLINE, replicationDisabledPartitions.contains(replicaId.getPartitionId()));
                continue;
            }
            if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
                // check if all missing keys for standby replicas from previous replication cycle are now obtained
                // via leader replica. If we still have missing keys, don't include them in current replication cycle
                // to avoid sending duplicate metadata requests since their token wouldn't have advanced.
                processMissingKeysFromPreviousMetadataResponse(remoteReplicaInfo);
                if (containsMissingKeysFromPreviousMetadataExchange(remoteReplicaInfo)) {
                    standbyReplicasWithNoProgress.add(remoteReplicaInfo);
                    continue;
                }
            }
            activeReplicasPerNode.add(remoteReplicaInfo);
        }
        logger.trace("Replicating from {} RemoteReplicaInfos.", activeReplicasPerNode.size());
        // use a variable to track the current replica list to replicate (for logging purposes)
        List<RemoteReplicaInfo> currentReplicaList = activeReplicasPerNode;
        try {
            if (activeReplicasPerNode.size() > 0) {
                allCaughtUp = false;
                // if maxReplicaCountPerRequest > 0, split remote replicas on same node into multiple lists; otherwise there is
                // no limit.
                List<List<RemoteReplicaInfo>> activeReplicaSubLists = maxReplicaCountPerRequest > 0 ? Utils.partitionList(activeReplicasPerNode, maxReplicaCountPerRequest) : Collections.singletonList(activeReplicasPerNode);
                startTimeInMs = time.milliseconds();
                connectedChannel = connectionPool.checkOutConnection(remoteNode.getHostname(), activeReplicasPerNode.get(0).getPort(), replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
                checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
                // we checkout ConnectedChannel once and replicate remote replicas in batch via same ConnectedChannel
                for (List<RemoteReplicaInfo> replicaSubList : activeReplicaSubLists) {
                    exchangeMetadataTimeInMs = -1;
                    fixMissingStoreKeysTimeInMs = -1;
                    currentReplicaList = replicaSubList;
                    logger.debug("Exchanging metadata with {} remote replicas on {}", currentReplicaList.size(), remoteNode);
                    startTimeInMs = time.milliseconds();
                    List<ExchangeMetadataResponse> exchangeMetadataResponseList = exchangeMetadata(connectedChannel, replicaSubList);
                    exchangeMetadataTimeInMs = time.milliseconds() - startTimeInMs;
                    if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
                        // If leader based replication is enabled and we are replicating from remote colo, fetch the missing blobs
                        // only for local leader replicas from their corresponding peer leader replicas (Leader <-> Leader).
                        // Non-leader replica pairs (standby <-> leaders, leader <-> standby, standby <-> standby) will get their
                        // missing blobs from their leader pair exchanges and intra-dc replication.
                        List<RemoteReplicaInfo> leaderReplicaList = new ArrayList<>();
                        List<ExchangeMetadataResponse> exchangeMetadataResponseListForLeaderReplicas = new ArrayList<>();
                        getLeaderReplicaList(replicaSubList, exchangeMetadataResponseList, leaderReplicaList, exchangeMetadataResponseListForLeaderReplicas);
                        replicaSubList = leaderReplicaList;
                        exchangeMetadataResponseList = exchangeMetadataResponseListForLeaderReplicas;
                    }
                    if (replicaSubList.size() > 0) {
                        startTimeInMs = time.milliseconds();
                        fixMissingStoreKeys(connectedChannel, replicaSubList, exchangeMetadataResponseList, false);
                        fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
                    }
                }
            }
            if (replicatingFromRemoteColo && leaderBasedReplicationAdmin != null) {
                // Get a list of blocked standby replicas whose missing keys haven't arrived for long time.
                // Use case: In leader-based cross colo replication, standby replicas don't send GET requests for missing keys
                // found in metadata exchange and expect them to come via leader <-> leader replication.
                // This is a safety condition to ensure that standby replicas are not stuck waiting for the keys to come from leader
                // by fetching the missing keys themselves.
                // TODO: As an improvement to this, we can first fetch missing blobs from local leader/other replicas in intra-dc first.
                // TODO: If the result to fetch a blob from local dc is Blob_Not_Found, then we can fetch it from replicas in remote datacenter.
                // This will involve co-ordination between replica threads containing replicas of same partition.
                List<RemoteReplicaInfo> standbyReplicasTimedOutOnNoProgress = getRemoteStandbyReplicasTimedOutOnNoProgress(standbyReplicasWithNoProgress);
                if (standbyReplicasTimedOutOnNoProgress.size() > 0) {
                    allCaughtUp = false;
                    currentReplicaList = standbyReplicasTimedOutOnNoProgress;
                    if (connectedChannel == null) {
                        checkoutConnectionTimeInMs = -1;
                        startTimeInMs = time.milliseconds();
                        connectedChannel = connectionPool.checkOutConnection(remoteNode.getHostname(), standbyReplicasTimedOutOnNoProgress.get(0).getPort(), replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
                        checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
                    }
                    List<ExchangeMetadataResponse> exchangeMetadataResponseListForBlockedReplicas = standbyReplicasTimedOutOnNoProgress.stream().map(remoteReplicaInfo -> new ExchangeMetadataResponse(remoteReplicaInfo.getExchangeMetadataResponse())).collect(Collectors.toList());
                    // Convert (and cache) the remote keys that are being fetched as the StoreKeyConverter would have cleared
                    // these keys from its cache while it is replicating with other replicas before time out happened for these standby replicas.
                    List<StoreKey> storeKeysToConvert = exchangeMetadataResponseListForBlockedReplicas.stream().map(ExchangeMetadataResponse::getMissingStoreKeys).flatMap(Collection::stream).collect(Collectors.toList());
                    convertStoreKeys(storeKeysToConvert);
                    exchangeMetadataTimeInMs = 0;
                    fixMissingStoreKeysTimeInMs = -1;
                    logger.debug("Sending GET request to fetch missing keys for standby remote replicas {} timed out on no progress", currentReplicaList);
                    startTimeInMs = time.milliseconds();
                    fixMissingStoreKeys(connectedChannel, standbyReplicasTimedOutOnNoProgress, exchangeMetadataResponseListForBlockedReplicas, true);
                    fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
                }
            }
        } catch (Throwable e) {
            if (checkoutConnectionTimeInMs == -1) {
                // throwable happened in checkout connection phase
                checkoutConnectionTimeInMs = time.milliseconds() - startTimeInMs;
                responseHandler.onEvent(currentReplicaList.get(0).getReplicaId(), e);
            } else if (exchangeMetadataTimeInMs == -1) {
                // throwable happened in exchange metadata phase
                exchangeMetadataTimeInMs = time.milliseconds() - startTimeInMs;
            } else if (fixMissingStoreKeysTimeInMs == -1) {
                // throwable happened in fix missing store phase
                fixMissingStoreKeysTimeInMs = time.milliseconds() - startTimeInMs;
            }
            logger.error("Error while talking to peer: Remote node: {}, Thread name: {}, Remote replicas: {}, Current active " + "remote replica list: {}, Checkout connection time: {}, Exchange metadata time: {}, Fix missing " + "store key time {}", remoteNode, threadName, replicasToReplicatePerNode, currentReplicaList, checkoutConnectionTimeInMs, exchangeMetadataTimeInMs, fixMissingStoreKeysTimeInMs, e);
            replicationMetrics.incrementReplicationErrors(replicatingOverSsl);
            if (connectedChannel != null) {
                connectionPool.destroyConnection(connectedChannel);
                connectedChannel = null;
            }
        } finally {
            long totalReplicationTime = time.milliseconds() - replicationStartTimeInMs;
            replicationMetrics.updateTotalReplicationTime(totalReplicationTime, replicatingFromRemoteColo, replicatingOverSsl, datacenterName);
            if (connectedChannel != null) {
                connectionPool.checkInConnection(connectedChannel);
            }
            context.stop();
            portTypeBasedContext.stop();
        }
    }
    long sleepDurationMs = 0;
    if (allCaughtUp && replicationConfig.replicationReplicaThreadIdleSleepDurationMs > 0) {
        sleepDurationMs = replicationConfig.replicationReplicaThreadIdleSleepDurationMs;
        idleCount.inc();
    } else if (threadThrottleDurationMs > 0) {
        sleepDurationMs = threadThrottleDurationMs;
        throttleCount.inc();
    }
    if (sleepDurationMs > 0) {
        try {
            long currentTime = time.milliseconds();
            time.sleep(sleepDurationMs);
            logger.trace("Replica thread: {} slept for {} ms", threadName, time.milliseconds() - currentTime);
        } catch (InterruptedException e) {
            logger.error("Received interrupted exception during throttling", e);
        }
    }
}
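
Finally, a hedged sketch of how a cycle like this is typically driven: a loop that keeps calling replicate() until the thread is asked to stop. Only replicate() and the fields used in the example (running, logger, threadName) are taken from the code above; the loop structure and error handling are assumptions about one reasonable way to run it.

// Hypothetical driver loop (not ambry's actual run() method): repeat full replication cycles
// until shutdown. replicate() already handles per-cycle throttling and idle sleeps internally.
public void runSketch() {
    try {
        while (running) {
            replicate();
        }
    } catch (Throwable e) {
        logger.error("Replica thread {} exiting due to unexpected error", threadName, e);
    }
}
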
Also used : GetOption(com.github.ambry.protocol.GetOption) StoreKeyConverter(com.github.ambry.store.StoreKeyConverter) DataNodeId(com.github.ambry.clustermap.DataNodeId) LoggerFactory(org.slf4j.LoggerFactory) MessageFormatWriteSet(com.github.ambry.messageformat.MessageFormatWriteSet) StoreErrorCodes(com.github.ambry.store.StoreErrorCodes) GetResponse(com.github.ambry.protocol.GetResponse) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Counter(com.codahale.metrics.Counter) ReplicaMetadataRequest(com.github.ambry.protocol.ReplicaMetadataRequest) GetRequest(com.github.ambry.protocol.GetRequest) ReplicationConfig(com.github.ambry.config.ReplicationConfig) NotificationSystem(com.github.ambry.notification.NotificationSystem) ReplicaSyncUpManager(com.github.ambry.clustermap.ReplicaSyncUpManager) PartitionResponseInfo(com.github.ambry.protocol.PartitionResponseInfo) Predicate(java.util.function.Predicate) Collection(java.util.Collection) Set(java.util.Set) Utils(com.github.ambry.utils.Utils) Collectors(java.util.stream.Collectors) ConnectedChannel(com.github.ambry.network.ConnectedChannel) ReplicaMetadataRequestInfo(com.github.ambry.protocol.ReplicaMetadataRequestInfo) CountDownLatch(java.util.concurrent.CountDownLatch) StoreKey(com.github.ambry.store.StoreKey) List(java.util.List) ReplicaMetadataResponse(com.github.ambry.protocol.ReplicaMetadataResponse) MessageFormatFlags(com.github.ambry.messageformat.MessageFormatFlags) UpdateType(com.github.ambry.notification.UpdateType) Timer(com.codahale.metrics.Timer) MessageSievingInputStream(com.github.ambry.messageformat.MessageSievingInputStream) PartitionId(com.github.ambry.clustermap.PartitionId) BlobId(com.github.ambry.commons.BlobId) ResponseHandler(com.github.ambry.commons.ResponseHandler) PartitionRequestInfo(com.github.ambry.protocol.PartitionRequestInfo) BlobReplicaSourceType(com.github.ambry.notification.BlobReplicaSourceType) ServerErrorCode(com.github.ambry.server.ServerErrorCode) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) NettyByteBufDataInputStream(com.github.ambry.utils.NettyByteBufDataInputStream) HashSet(java.util.HashSet) Transformer(com.github.ambry.store.Transformer) ChannelOutput(com.github.ambry.network.ChannelOutput) StoreException(com.github.ambry.store.StoreException) ReplicaMetadataResponseInfo(com.github.ambry.protocol.ReplicaMetadataResponseInfo) CloudDataNode(com.github.ambry.clustermap.CloudDataNode) Time(com.github.ambry.utils.Time) ReplicaState(com.github.ambry.clustermap.ReplicaState) MetricRegistry(com.codahale.metrics.MetricRegistry) Logger(org.slf4j.Logger) ReentrantLock(java.util.concurrent.locks.ReentrantLock) ConnectionPool(com.github.ambry.network.ConnectionPool) ClusterMap(com.github.ambry.clustermap.ClusterMap) IOException(java.io.IOException) Condition(java.util.concurrent.locks.Condition) MessageInfo(com.github.ambry.store.MessageInfo) ReplicaId(com.github.ambry.clustermap.ReplicaId) BlobStore(com.github.ambry.store.BlobStore) Collections(java.util.Collections)

Aggregations

DataNodeId (com.github.ambry.clustermap.DataNodeId): 92
ArrayList (java.util.ArrayList): 45
Test (org.junit.Test): 45
HashMap (java.util.HashMap): 29
PartitionId (com.github.ambry.clustermap.PartitionId): 28
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId): 27
ReplicaId (com.github.ambry.clustermap.ReplicaId): 25
MockClusterMap (com.github.ambry.clustermap.MockClusterMap): 23
VerifiableProperties (com.github.ambry.config.VerifiableProperties): 23
MetricRegistry (com.codahale.metrics.MetricRegistry): 22
MockPartitionId (com.github.ambry.clustermap.MockPartitionId): 22
List (java.util.List): 22
Map (java.util.Map): 22
Port (com.github.ambry.network.Port): 21
ClusterMap (com.github.ambry.clustermap.ClusterMap): 20
ClusterMapConfig (com.github.ambry.config.ClusterMapConfig): 19
StoreKeyFactory (com.github.ambry.store.StoreKeyFactory): 18
BlobIdFactory (com.github.ambry.commons.BlobIdFactory): 17
HashSet (java.util.HashSet): 16
Properties (java.util.Properties): 16