
Example 1 with DataNodeId

use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

the class ReplicaThread method replicate.

/**
 * Replicates from the given remote replicas, which are grouped by data node.
 * @param replicasToReplicate lists of {@link RemoteReplicaInfo}, one list per data node
 */
void replicate(List<List<RemoteReplicaInfo>> replicasToReplicate) {
    // shuffle the nodes
    Collections.shuffle(replicasToReplicate);
    for (List<RemoteReplicaInfo> replicasToReplicatePerNode : replicasToReplicate) {
        if (!running) {
            break;
        }
        DataNodeId remoteNode = replicasToReplicatePerNode.get(0).getReplicaId().getDataNodeId();
        logger.trace("Remote node: {} Thread name: {} Remote replicas: {}", remoteNode, threadName, replicasToReplicatePerNode);
        Timer.Context context = null;
        Timer.Context portTypeBasedContext = null;
        if (replicatingFromRemoteColo) {
            context = replicationMetrics.interColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            if (replicatingOverSsl) {
                portTypeBasedContext = replicationMetrics.sslInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            } else {
                portTypeBasedContext = replicationMetrics.plainTextInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
            }
        } else {
            context = replicationMetrics.intraColoReplicationLatency.time();
            if (replicatingOverSsl) {
                portTypeBasedContext = replicationMetrics.sslIntraColoReplicationLatency.time();
            } else {
                portTypeBasedContext = replicationMetrics.plainTextIntraColoReplicationLatency.time();
            }
        }
        ConnectedChannel connectedChannel = null;
        long checkoutConnectionTimeInMs = -1;
        long exchangeMetadataTimeInMs = -1;
        long fixMissingStoreKeysTimeInMs = -1;
        long replicationStartTimeInMs = SystemTime.getInstance().milliseconds();
        long startTimeInMs = replicationStartTimeInMs;
        List<RemoteReplicaInfo> activeReplicasPerNode = new ArrayList<RemoteReplicaInfo>();
        // Skip replicas whose partition has replication disabled or whose replica is marked down.
        for (RemoteReplicaInfo remoteReplicaInfo : replicasToReplicatePerNode) {
            ReplicaId replicaId = remoteReplicaInfo.getReplicaId();
            if (!replicationDisabledPartitions.contains(replicaId.getPartitionId()) && !replicaId.isDown()) {
                activeReplicasPerNode.add(remoteReplicaInfo);
            }
        }
        if (activeReplicasPerNode.size() > 0) {
            try {
                connectedChannel = connectionPool.checkOutConnection(remoteNode.getHostname(), activeReplicasPerNode.get(0).getPort(), replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
                checkoutConnectionTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
                startTimeInMs = SystemTime.getInstance().milliseconds();
                List<ExchangeMetadataResponse> exchangeMetadataResponseList = exchangeMetadata(connectedChannel, activeReplicasPerNode);
                exchangeMetadataTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
                startTimeInMs = SystemTime.getInstance().milliseconds();
                fixMissingStoreKeys(connectedChannel, activeReplicasPerNode, exchangeMetadataResponseList);
                fixMissingStoreKeysTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
            } catch (Throwable e) {
                if (checkoutConnectionTimeInMs == -1) {
                    // throwable happened in checkout connection phase
                    checkoutConnectionTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
                    responseHandler.onEvent(activeReplicasPerNode.get(0).getReplicaId(), e);
                } else if (exchangeMetadataTimeInMs == -1) {
                    // throwable happened in exchange metadata phase
                    exchangeMetadataTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
                } else if (fixMissingStoreKeysTimeInMs == -1) {
                    // throwable happened in fix missing store phase
                    fixMissingStoreKeysTimeInMs = SystemTime.getInstance().milliseconds() - startTimeInMs;
                }
                logger.error("Error while talking to peer: Remote node: {}, Thread name: {}, Remote replicas: {}, Active " + "remote replicas: {}, Checkout connection time: {}, Exchange metadata time: {}, Fix missing store key " + "time {}", remoteNode, threadName, replicasToReplicatePerNode, activeReplicasPerNode, checkoutConnectionTimeInMs, exchangeMetadataTimeInMs, fixMissingStoreKeysTimeInMs, e);
                replicationMetrics.incrementReplicationErrors(replicatingOverSsl);
                if (connectedChannel != null) {
                    connectionPool.destroyConnection(connectedChannel);
                    connectedChannel = null;
                }
            } finally {
                long totalReplicationTime = SystemTime.getInstance().milliseconds() - replicationStartTimeInMs;
                replicationMetrics.updateTotalReplicationTime(totalReplicationTime, replicatingFromRemoteColo, replicatingOverSsl, datacenterName);
                if (connectedChannel != null) {
                    connectionPool.checkInConnection(connectedChannel);
                }
                context.stop();
                portTypeBasedContext.stop();
            }
        }
    }
}
Also used: Timer(com.codahale.metrics.Timer) ArrayList(java.util.ArrayList) ConnectedChannel(com.github.ambry.network.ConnectedChannel) DataNodeId(com.github.ambry.clustermap.DataNodeId) ReplicaId(com.github.ambry.clustermap.ReplicaId)
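
The latency timers checked out above (interColoReplicationLatency and friends) are plain Codahale Timers keyed by datacenter name. A minimal standalone sketch of the same checkout/stop discipline, with illustrative metric names rather than Ambry's actual ones:

import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ColoLatencyTimers {
    private final MetricRegistry registry = new MetricRegistry();
    // One Timer per remote datacenter, mirroring interColoReplicationLatency above.
    private final Map<String, Timer> interColoLatency = new ConcurrentHashMap<>();

    Timer timerFor(String datacenterName) {
        return interColoLatency.computeIfAbsent(datacenterName,
            dc -> registry.timer(MetricRegistry.name(ColoLatencyTimers.class, dc, "replicationLatency")));
    }

    void timedReplication(String datacenterName, Runnable work) {
        // Timer.time() hands back a Context; stopping it records the elapsed time.
        Timer.Context context = timerFor(datacenterName).time();
        try {
            work.run();
        } finally {
            // Same discipline as the finally block in replicate(): always stop the context.
            context.stop();
        }
    }
}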

Example 2 with DataNodeId

use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

the class ReplicationMetrics method addRemoteReplicaToLagMetrics.

public void addRemoteReplicaToLagMetrics(final RemoteReplicaInfo remoteReplicaInfo) {
    ReplicaId replicaId = remoteReplicaInfo.getReplicaId();
    DataNodeId dataNodeId = replicaId.getDataNodeId();
    final String metricName = dataNodeId.getHostname() + "-" + dataNodeId.getPort() + "-" + replicaId.getPartitionId() + "-replicaLagInBytes";
    Gauge<Long> replicaLag = new Gauge<Long>() {

        @Override
        public Long getValue() {
            return remoteReplicaInfo.getRemoteLagFromLocalInBytes();
        }
    };
    registry.register(MetricRegistry.name(ReplicationMetrics.class, metricName), replicaLag);
    replicaLagInBytes.add(replicaLag);
}
Also used: DataNodeId(com.github.ambry.clustermap.DataNodeId) ReplicaId(com.github.ambry.clustermap.ReplicaId) Gauge(com.codahale.metrics.Gauge)
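
Since Gauge is a single-method interface (getValue()), the anonymous class above can be replaced by a method reference in metrics 3.x and later. A self-contained sketch, with an AtomicLong standing in for remoteReplicaInfo.getRemoteLagFromLocalInBytes():

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
import java.util.concurrent.atomic.AtomicLong;

public class LagGaugeSketch {
    public static void main(String[] args) {
        MetricRegistry registry = new MetricRegistry();
        // Hypothetical stand-in for the remote replica's lag source.
        AtomicLong lagInBytes = new AtomicLong(0);
        String metricName = MetricRegistry.name(LagGaugeSketch.class, "replicaLagInBytes");
        // Gauge is a functional interface, so a method reference replaces the anonymous class.
        registry.register(metricName, (Gauge<Long>) lagInBytes::get);
        lagInBytes.set(1024);
        System.out.println(registry.getGauges().get(metricName).getValue()); // prints 1024
    }
}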

Example 3 with DataNodeId

use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

the class BlobValidator method getMismatchDetails.

/**
 * Compares the {@link ServerResponse} from each replica of a blob.
 * @param blobId the blob ID for which the comparison is made
 * @param dataNodeIdBlobContentMap the map from each replica's {@link DataNodeId} to its {@link ServerResponse}
 * @return a list of mismatch details; an empty list if there are no mismatches.
 */
private List<String> getMismatchDetails(String blobId, Map<DataNodeId, ServerResponse> dataNodeIdBlobContentMap) {
    List<String> mismatchDetailsList = new ArrayList<>();
    Iterator<DataNodeId> dataNodeIdIterator = dataNodeIdBlobContentMap.keySet().iterator();
    // Use the first node's response as the reference for all comparisons.
    DataNodeId dataNodeId1 = dataNodeIdIterator.next();
    ServerResponse dataNode1ServerResponse = dataNodeIdBlobContentMap.get(dataNodeId1);
    while (dataNodeIdIterator.hasNext()) {
        DataNodeId dataNodeId2 = dataNodeIdIterator.next();
        ServerResponse dataNode2ServerResponse = dataNodeIdBlobContentMap.get(dataNodeId2);
        String mismatchDetails = dataNode1ServerResponse.getMismatchDetails(dataNode2ServerResponse);
        if (mismatchDetails != null) {
            mismatchDetails = "Mismatch for [" + blobId + "] between [" + dataNodeId1 + "] and [" + dataNodeId2 + "] - " + mismatchDetails;
            mismatchDetailsList.add(mismatchDetails);
        }
    }
    return mismatchDetailsList;
}
Also used: ArrayList(java.util.ArrayList) DataNodeId(com.github.ambry.clustermap.DataNodeId)
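
Note the comparison strategy: every response is checked against the first node's response only, which is sufficient because response equality is transitive (if all nodes match the first, they all match each other). A generic sketch of the same first-vs-rest pattern, with illustrative names:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;

public class FirstVsRest {
    // Compare every entry's value against the first entry's; one detail string per mismatch.
    static <K, V> List<String> mismatchesAgainstFirst(Map<K, V> responses) {
        List<String> mismatches = new ArrayList<>();
        Iterator<Map.Entry<K, V>> it = responses.entrySet().iterator();
        // Like the original, this throws NoSuchElementException on an empty map.
        Map.Entry<K, V> reference = it.next();
        while (it.hasNext()) {
            Map.Entry<K, V> other = it.next();
            if (!Objects.equals(reference.getValue(), other.getValue())) {
                mismatches.add("Mismatch between [" + reference.getKey() + "] and [" + other.getKey() + "]");
            }
        }
        return mismatches;
    }
}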

Example 4 with DataNodeId

use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

the class BlobValidator method validateBlobOnAllReplicas.

/**
 * Validates {@code blobId} on all of its replicas.
 * @param blobId the {@link BlobId} to operate on.
 * @param getOption the {@link GetOption} to use with the {@link com.github.ambry.protocol.GetRequest}.
 * @param clusterMap the {@link ClusterMap} instance to use.
 * @param storeKeyFactory the {@link StoreKeyFactory} to use.
 * @return a list of mismatch details; an empty list if there are no mismatches.
 * @throws InterruptedException if interrupted while fetching records from a replica
 */
private List<String> validateBlobOnAllReplicas(BlobId blobId, GetOption getOption, ClusterMap clusterMap, StoreKeyFactory storeKeyFactory) throws InterruptedException {
    Map<DataNodeId, ServerResponse> dataNodeIdBlobContentMap = new HashMap<>();
    for (ReplicaId replicaId : blobId.getPartition().getReplicaIds()) {
        ServerResponse response = getRecordFromNode(replicaId.getDataNodeId(), blobId, getOption, clusterMap, storeKeyFactory);
        dataNodeIdBlobContentMap.put(replicaId.getDataNodeId(), response);
    }
    return getMismatchDetails(blobId.getID(), dataNodeIdBlobContentMap);
}
Also used: HashMap(java.util.HashMap) DataNodeId(com.github.ambry.clustermap.DataNodeId) ReplicaId(com.github.ambry.clustermap.ReplicaId)
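
One subtlety: because dataNodeIdBlobContentMap is a HashMap, the reference node that getMismatchDetails compares against is arbitrary. A hypothetical one-line tweak makes it deterministic:

    // Hypothetical tweak: a LinkedHashMap preserves insertion order, so the reference
    // replica picked by getMismatchDetails (the first map key) is the first one queried.
    Map<DataNodeId, ServerResponse> dataNodeIdBlobContentMap = new java.util.LinkedHashMap<>();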

Example 5 with DataNodeId

use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.

the class ServerAdminTool method main.

/**
 * Runs the server admin tool.
 * @param args the command line arguments.
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    VerifiableProperties verifiableProperties = ToolUtils.getVerifiableProperties(args);
    ServerAdminToolConfig config = new ServerAdminToolConfig(verifiableProperties);
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    ClusterMap clusterMap = ((ClusterAgentsFactory) Utils.getObj(clusterMapConfig.clusterMapClusterAgentsFactory, clusterMapConfig, config.hardwareLayoutFilePath, config.partitionLayoutFilePath)).getClusterMap();
    SSLFactory sslFactory = !clusterMapConfig.clusterMapSslEnabledDatacenters.isEmpty() ? SSLFactory.getNewInstance(new SSLConfig(verifiableProperties)) : null;
    ServerAdminTool serverAdminTool = new ServerAdminTool(clusterMap, sslFactory, verifiableProperties);
    File file = new File(config.dataOutputFilePath);
    if (!file.exists() && !file.createNewFile()) {
        throw new IllegalStateException("Could not create " + file);
    }
    FileOutputStream outputFileStream = new FileOutputStream(config.dataOutputFilePath);
    DataNodeId dataNodeId = clusterMap.getDataNodeId(config.hostname, config.port);
    if (dataNodeId == null) {
        throw new IllegalArgumentException("Could not find a data node corresponding to " + config.hostname + ":" + config.port);
    }
    switch(config.typeOfOperation) {
        case GetBlobProperties:
            BlobId blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, BlobProperties> bpResponse = serverAdminTool.getBlobProperties(dataNodeId, blobId, config.getOption, clusterMap);
            if (bpResponse.getFirst() == ServerErrorCode.No_Error) {
                LOGGER.info("Blob properties for {} from {}: {}", blobId, dataNodeId, bpResponse.getSecond());
            } else {
                LOGGER.error("Failed to get blob properties for {} from {} with option {}. Error code is {}", blobId, dataNodeId, config.getOption, bpResponse.getFirst());
            }
            break;
        case GetUserMetadata:
            blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, ByteBuffer> umResponse = serverAdminTool.getUserMetadata(dataNodeId, blobId, config.getOption, clusterMap);
            if (umResponse.getFirst() == ServerErrorCode.No_Error) {
                writeBufferToFile(umResponse.getSecond(), outputFileStream);
                LOGGER.info("User metadata for {} from {} written to {}", blobId, dataNodeId, config.dataOutputFilePath);
            } else {
                LOGGER.error("Failed to get user metadata for {} from {} with option {}. Error code is {}", blobId, dataNodeId, config.getOption, umResponse.getFirst());
            }
            break;
        case GetBlob:
            blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, BlobData> bResponse = serverAdminTool.getBlob(dataNodeId, blobId, config.getOption, clusterMap);
            if (bResponse.getFirst() == ServerErrorCode.No_Error) {
                LOGGER.info("Blob type of {} from {} is {}", blobId, dataNodeId, bResponse.getSecond().getBlobType());
                ByteBuf buffer = bResponse.getSecond().content();
                try {
                    writeByteBufToFile(buffer, outputFileStream);
                } finally {
                    buffer.release();
                }
                LOGGER.info("Blob data for {} from {} written to {}", blobId, dataNodeId, config.dataOutputFilePath);
            } else {
                LOGGER.error("Failed to get blob data for {} from {} with option {}. Error code is {}", blobId, dataNodeId, config.getOption, bResponse.getFirst());
            }
            break;
        case TriggerCompaction:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
                for (String partitionIdStr : config.partitionIds) {
                    PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                    ServerErrorCode errorCode = serverAdminTool.triggerCompaction(dataNodeId, partitionId);
                    if (errorCode == ServerErrorCode.No_Error) {
                        LOGGER.info("Compaction has been triggered for {} on {}", partitionId, dataNodeId);
                    } else {
                        LOGGER.error("From {}, received server error code {} for trigger compaction request on {}", dataNodeId, errorCode, partitionId);
                    }
                }
            } else {
                LOGGER.error("There were no partitions provided to trigger compaction on");
            }
            break;
        case RequestControl:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
                for (String partitionIdStr : config.partitionIds) {
                    PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                    sendRequestControlRequest(serverAdminTool, dataNodeId, partitionId, config.requestTypeToControl, config.enableState);
                }
            } else {
                LOGGER.info("No partition list provided. Requesting enable status of {} to be set to {} on all partitions", config.requestTypeToControl, config.enableState);
                sendRequestControlRequest(serverAdminTool, dataNodeId, null, config.requestTypeToControl, config.enableState);
            }
            break;
        case ReplicationControl:
            List<String> origins = Collections.emptyList();
            if (config.origins.length > 0 && !config.origins[0].isEmpty()) {
                origins = Arrays.asList(config.origins);
            }
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
                for (String partitionIdStr : config.partitionIds) {
                    PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                    sendReplicationControlRequest(serverAdminTool, dataNodeId, partitionId, origins, config.enableState);
                }
            } else {
                LOGGER.info("No partition list provided. Requesting enable status for replication from {} to be set to {} on " + "all partitions", origins.isEmpty() ? "all DCs" : origins, config.enableState);
                sendReplicationControlRequest(serverAdminTool, dataNodeId, null, origins, config.enableState);
            }
            break;
        case CatchupStatus:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
                for (String partitionIdStr : config.partitionIds) {
                    PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                    Pair<ServerErrorCode, Boolean> response = serverAdminTool.isCaughtUp(dataNodeId, partitionId, config.acceptableLagInBytes, config.numReplicasCaughtUpPerPartition);
                    if (response.getFirst() == ServerErrorCode.No_Error) {
                        LOGGER.info("Replicas are {} within {} bytes for {}", response.getSecond() ? "" : "NOT", config.acceptableLagInBytes, partitionId);
                    } else {
                        LOGGER.error("From {}, received server error code {} for request for catchup status of {}", dataNodeId, response.getFirst(), partitionId);
                    }
                }
            } else {
                Pair<ServerErrorCode, Boolean> response = serverAdminTool.isCaughtUp(dataNodeId, null, config.acceptableLagInBytes, config.numReplicasCaughtUpPerPartition);
                if (response.getFirst() == ServerErrorCode.No_Error) {
                    LOGGER.info("Replicas are {} within {} bytes for all partitions", response.getSecond() ? "" : "NOT", config.acceptableLagInBytes);
                } else {
                    LOGGER.error("From {}, received server error code {} for request for catchup status of all partitions", dataNodeId, response.getFirst());
                }
            }
            break;
        case BlobStoreControl:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
                for (String partitionIdStr : config.partitionIds) {
                    PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                    sendBlobStoreControlRequest(serverAdminTool, dataNodeId, partitionId, config.numReplicasCaughtUpPerPartition, config.storeControlRequestType);
                }
            } else {
                LOGGER.error("There were no partitions provided to be controlled (Start/Stop)");
            }
            break;
        default:
            throw new IllegalStateException("Recognized but unsupported operation: " + config.typeOfOperation);
    }
    serverAdminTool.close();
    outputFileStream.close();
    clusterMap.close();
    System.out.println("Server admin tool is safely closed");
    System.exit(0);
}
Also used: ClusterMap(com.github.ambry.clustermap.ClusterMap) SSLFactory(com.github.ambry.commons.SSLFactory) ByteBuf(io.netty.buffer.ByteBuf) BlobData(com.github.ambry.messageformat.BlobData) ClusterAgentsFactory(com.github.ambry.clustermap.ClusterAgentsFactory) SSLConfig(com.github.ambry.config.SSLConfig) VerifiableProperties(com.github.ambry.config.VerifiableProperties) PartitionId(com.github.ambry.clustermap.PartitionId) ByteBuffer(java.nio.ByteBuffer) ClusterMapConfig(com.github.ambry.config.ClusterMapConfig) ServerErrorCode(com.github.ambry.server.ServerErrorCode) FileOutputStream(java.io.FileOutputStream) BlobProperties(com.github.ambry.messageformat.BlobProperties) File(java.io.File) DataNodeId(com.github.ambry.clustermap.DataNodeId) BlobId(com.github.ambry.commons.BlobId)
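
The GetUserMetadata and GetBlob branches above persist payloads through helpers like writeBufferToFile. The real Ambry helper is not shown here; a plausible minimal sketch, assuming it simply drains the ByteBuffer through the stream's FileChannel:

import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

public class BufferFileWriter {
    // Sketch of a writeBufferToFile-style helper: drain the buffer to the file's channel.
    static void writeBufferToFile(ByteBuffer buffer, FileOutputStream outputFileStream) throws IOException {
        while (buffer.hasRemaining()) {
            outputFileStream.getChannel().write(buffer);
        }
    }
}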

Aggregations

DataNodeId (com.github.ambry.clustermap.DataNodeId): 92
ArrayList (java.util.ArrayList): 45
Test (org.junit.Test): 45
HashMap (java.util.HashMap): 29
PartitionId (com.github.ambry.clustermap.PartitionId): 28
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId): 27
ReplicaId (com.github.ambry.clustermap.ReplicaId): 25
MockClusterMap (com.github.ambry.clustermap.MockClusterMap): 23
VerifiableProperties (com.github.ambry.config.VerifiableProperties): 23
MetricRegistry (com.codahale.metrics.MetricRegistry): 22
MockPartitionId (com.github.ambry.clustermap.MockPartitionId): 22
List (java.util.List): 22
Map (java.util.Map): 22
Port (com.github.ambry.network.Port): 21
ClusterMap (com.github.ambry.clustermap.ClusterMap): 20
ClusterMapConfig (com.github.ambry.config.ClusterMapConfig): 19
StoreKeyFactory (com.github.ambry.store.StoreKeyFactory): 18
BlobIdFactory (com.github.ambry.commons.BlobIdFactory): 17
HashSet (java.util.HashSet): 16
Properties (java.util.Properties): 16