use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
the class ReplicaThread method replicate.
/**
 * Runs one replication pass against each remote data node in the given list, visiting the nodes in random order
 * and stopping early once this thread is no longer running.
 * <p>
 * For every node, only the replicas whose partition is not in the replication-disabled set and which are not down
 * are considered. If at least one such replica exists, a connection to the node is checked out, metadata is
 * exchanged and the missing store keys are fixed. The time taken by each of these phases is tracked so that it can
 * be reported if the pass fails.
 * @param replicasToReplicate list of {@link RemoteReplicaInfo} by data node
 */
void replicate(List<List<RemoteReplicaInfo>> replicasToReplicate) {
  // randomize the order in which the nodes are visited
  Collections.shuffle(replicasToReplicate);
  for (List<RemoteReplicaInfo> replicasOnNode : replicasToReplicate) {
    if (!running) {
      break;
    }
    DataNodeId remoteNode = replicasOnNode.get(0).getReplicaId().getDataNodeId();
    logger.trace("Remote node: {} Thread name: {} Remote replicas: {}", remoteNode, threadName, replicasOnNode);

    // start the overall latency timer first and then the port type (SSL / plain text) specific one
    Timer.Context context;
    Timer.Context portTypeBasedContext;
    if (!replicatingFromRemoteColo) {
      context = replicationMetrics.intraColoReplicationLatency.time();
      portTypeBasedContext = replicatingOverSsl ? replicationMetrics.sslIntraColoReplicationLatency.time()
          : replicationMetrics.plainTextIntraColoReplicationLatency.time();
    } else {
      context = replicationMetrics.interColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
      portTypeBasedContext = replicatingOverSsl
          ? replicationMetrics.sslInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time()
          : replicationMetrics.plainTextInterColoReplicationLatency.get(remoteNode.getDatacenterName()).time();
    }

    ConnectedChannel channel = null;
    // a duration of -1 marks a phase that has not finished; the catch block relies on this to find the failed phase
    long checkoutConnectionTimeInMs = -1;
    long exchangeMetadataTimeInMs = -1;
    long fixMissingStoreKeysTimeInMs = -1;
    long replicationStartTimeInMs = SystemTime.getInstance().milliseconds();
    long phaseStartTimeInMs = replicationStartTimeInMs;

    // keep only the replicas that are allowed to replicate and are not down
    List<RemoteReplicaInfo> activeReplicas = new ArrayList<RemoteReplicaInfo>();
    for (RemoteReplicaInfo candidate : replicasOnNode) {
      ReplicaId candidateId = candidate.getReplicaId();
      boolean replicationDisabled = replicationDisabledPartitions.contains(candidateId.getPartitionId());
      if (!(replicationDisabled || candidateId.isDown())) {
        activeReplicas.add(candidate);
      }
    }
    if (activeReplicas.isEmpty()) {
      continue;
    }

    try {
      // phase 1: check out a connection to the remote node
      channel = connectionPool.checkOutConnection(remoteNode.getHostname(), activeReplicas.get(0).getPort(),
          replicationConfig.replicationConnectionPoolCheckoutTimeoutMs);
      checkoutConnectionTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
      phaseStartTimeInMs = SystemTime.getInstance().milliseconds();
      // phase 2: exchange metadata
      List<ExchangeMetadataResponse> metadataResponses = exchangeMetadata(channel, activeReplicas);
      exchangeMetadataTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
      phaseStartTimeInMs = SystemTime.getInstance().milliseconds();
      // phase 3: fix the missing store keys
      fixMissingStoreKeys(channel, activeReplicas, metadataResponses);
      fixMissingStoreKeysTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
    } catch (Throwable e) {
      // the first phase that still has the -1 marker is the one that failed; record how long it ran
      if (checkoutConnectionTimeInMs == -1) {
        // failure while checking out the connection; the response handler is notified only in this case
        checkoutConnectionTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
        responseHandler.onEvent(activeReplicas.get(0).getReplicaId(), e);
      } else if (exchangeMetadataTimeInMs == -1) {
        // failure while exchanging metadata
        exchangeMetadataTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
      } else if (fixMissingStoreKeysTimeInMs == -1) {
        // failure while fixing missing store keys
        fixMissingStoreKeysTimeInMs = SystemTime.getInstance().milliseconds() - phaseStartTimeInMs;
      }
      logger.error("Error while talking to peer: Remote node: {}, Thread name: {}, Remote replicas: {}, Active "
              + "remote replicas: {}, Checkout connection time: {}, Exchange metadata time: {}, Fix missing store key "
              + "time {}", remoteNode, threadName, replicasOnNode, activeReplicas, checkoutConnectionTimeInMs,
          exchangeMetadataTimeInMs, fixMissingStoreKeysTimeInMs, e);
      replicationMetrics.incrementReplicationErrors(replicatingOverSsl);
      if (channel != null) {
        // destroy the connection instead of returning it; nulling it stops the finally block from checking it in
        connectionPool.destroyConnection(channel);
        channel = null;
      }
    } finally {
      long totalReplicationTime = SystemTime.getInstance().milliseconds() - replicationStartTimeInMs;
      replicationMetrics.updateTotalReplicationTime(totalReplicationTime, replicatingFromRemoteColo,
          replicatingOverSsl, datacenterName);
      if (channel != null) {
        connectionPool.checkInConnection(channel);
      }
      context.stop();
      portTypeBasedContext.stop();
    }
  }
}
use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
the class ReplicationMetrics method addRemoteReplicaToLagMetrics.
/**
 * Registers a gauge that reports the value of {@link RemoteReplicaInfo#getRemoteLagFromLocalInBytes()} for the given
 * remote replica and adds that gauge to {@code replicaLagInBytes}.
 * The metric is named {@code <hostname>-<port>-<partition id>-replicaLagInBytes}, using the hostname and port of the
 * data node that hosts the replica.
 * @param remoteReplicaInfo the {@link RemoteReplicaInfo} whose lag has to be tracked.
 */
public void addRemoteReplicaToLagMetrics(final RemoteReplicaInfo remoteReplicaInfo) {
  ReplicaId replica = remoteReplicaInfo.getReplicaId();
  DataNodeId hostingNode = replica.getDataNodeId();
  final String metricName = new StringBuilder().append(hostingNode.getHostname())
      .append("-")
      .append(hostingNode.getPort())
      .append("-")
      .append(replica.getPartitionId())
      .append("-replicaLagInBytes")
      .toString();
  // the gauge reads the lag from the replica info every time it is queried
  Gauge<Long> lagGauge = new Gauge<Long>() {
    @Override
    public Long getValue() {
      return remoteReplicaInfo.getRemoteLagFromLocalInBytes();
    }
  };
  String qualifiedMetricName = MetricRegistry.name(ReplicationMetrics.class, metricName);
  registry.register(qualifiedMetricName, lagGauge);
  replicaLagInBytes.add(lagGauge);
}
use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
the class BlobValidator method getMismatchDetails.
/**
 * Compares {@link ServerResponse} for a list of replicas.
 * <p>
 * The response of the first data node in the map's iteration order is used as the baseline and is compared against
 * the response of every other data node. A non-null result from the comparison is recorded as a mismatch.
 * @param blobId the blobId for which the comparison is made
 * @param dataNodeIdBlobContentMap the map containing the replica to their respective {@link ServerResponse}
 * @return a list of details if there are mismatches. Zero sized list if there aren't any mismatches or if the map
 *         is empty.
 */
private List<String> getMismatchDetails(String blobId, Map<DataNodeId, ServerResponse> dataNodeIdBlobContentMap) {
  List<String> mismatchDetailsList = new ArrayList<>();
  Iterator<Map.Entry<DataNodeId, ServerResponse>> entryIterator = dataNodeIdBlobContentMap.entrySet().iterator();
  if (!entryIterator.hasNext()) {
    // nothing to compare; without this guard the next() below would throw NoSuchElementException
    return mismatchDetailsList;
  }
  Map.Entry<DataNodeId, ServerResponse> baseline = entryIterator.next();
  while (entryIterator.hasNext()) {
    Map.Entry<DataNodeId, ServerResponse> other = entryIterator.next();
    String mismatchDetails = baseline.getValue().getMismatchDetails(other.getValue());
    if (mismatchDetails != null) {
      mismatchDetailsList.add(
          "Mismatch for [" + blobId + "] between [" + baseline.getKey() + "] and [" + other.getKey() + "] - "
              + mismatchDetails);
    }
  }
  return mismatchDetailsList;
}
use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
the class BlobValidator method validateBlobOnAllReplicas.
/**
 * Fetches the record for {@code blobId} from the data node of each replica of the blob's partition and reports
 * the mismatches found between the responses.
 * @param blobId the {@link BlobId} to operate on.
 * @param getOption the {@link GetOption} to use with the {@link com.github.ambry.protocol.GetRequest}.
 * @param clusterMap the {@link ClusterMap} instance to use.
 * @param storeKeyFactory the {@link StoreKeyFactory} to use.
 * @return a list of details if there are mismatches. Zero sized list if there aren't any mismatches.
 * @throws InterruptedException
 */
private List<String> validateBlobOnAllReplicas(BlobId blobId, GetOption getOption, ClusterMap clusterMap,
    StoreKeyFactory storeKeyFactory) throws InterruptedException {
  Map<DataNodeId, ServerResponse> responsesByNode = new HashMap<>();
  for (ReplicaId replica : blobId.getPartition().getReplicaIds()) {
    DataNodeId node = replica.getDataNodeId();
    ServerResponse nodeResponse = getRecordFromNode(node, blobId, getOption, clusterMap, storeKeyFactory);
    responsesByNode.put(node, nodeResponse);
  }
  String blobIdStr = blobId.getID();
  return getMismatchDetails(blobIdStr, responsesByNode);
}
use of com.github.ambry.clustermap.DataNodeId in project ambry by linkedin.
the class ServerAdminTool method main.
/**
 * Runs the server admin tool.
 * <p>
 * Builds the {@link ClusterMap} and the tool from the supplied arguments, looks up the data node that matches the
 * configured hostname and port and runs the configured operation against it. The output file is created (if it does
 * not exist) and opened for every operation, although only some operations write to it.
 * <p>
 * The output stream, the tool and the cluster map are closed even if the operation fails, so that a failure does not
 * leave them open. The "safely closed" message is printed and {@code System.exit(0)} is called only on success.
 * @param args associated arguments.
 * @throws Exception if the output file cannot be created, if no data node matches the configured hostname and port,
 *                   if the operation type is not supported or if the operation itself fails.
 */
public static void main(String[] args) throws Exception {
  VerifiableProperties verifiableProperties = ToolUtils.getVerifiableProperties(args);
  ServerAdminToolConfig config = new ServerAdminToolConfig(verifiableProperties);
  ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
  ClusterMap clusterMap = ((ClusterAgentsFactory) Utils.getObj(clusterMapConfig.clusterMapClusterAgentsFactory,
      clusterMapConfig, config.hardwareLayoutFilePath, config.partitionLayoutFilePath)).getClusterMap();
  try {
    SSLFactory sslFactory = !clusterMapConfig.clusterMapSslEnabledDatacenters.isEmpty() ? SSLFactory.getNewInstance(
        new SSLConfig(verifiableProperties)) : null;
    ServerAdminTool serverAdminTool = new ServerAdminTool(clusterMap, sslFactory, verifiableProperties);
    try {
      File file = new File(config.dataOutputFilePath);
      if (!file.exists() && !file.createNewFile()) {
        throw new IllegalStateException("Could not create " + file);
      }
      try (FileOutputStream outputFileStream = new FileOutputStream(config.dataOutputFilePath)) {
        DataNodeId dataNodeId = clusterMap.getDataNodeId(config.hostname, config.port);
        if (dataNodeId == null) {
          throw new IllegalArgumentException(
              "Could not find a data node corresponding to " + config.hostname + ":" + config.port);
        }
        switch (config.typeOfOperation) {
          case GetBlobProperties:
            BlobId blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, BlobProperties> bpResponse =
                serverAdminTool.getBlobProperties(dataNodeId, blobId, config.getOption, clusterMap);
            if (bpResponse.getFirst() == ServerErrorCode.No_Error) {
              LOGGER.info("Blob properties for {} from {}: {}", blobId, dataNodeId, bpResponse.getSecond());
            } else {
              LOGGER.error("Failed to get blob properties for {} from {} with option {}. Error code is {}", blobId,
                  dataNodeId, config.getOption, bpResponse.getFirst());
            }
            break;
          case GetUserMetadata:
            blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, ByteBuffer> umResponse =
                serverAdminTool.getUserMetadata(dataNodeId, blobId, config.getOption, clusterMap);
            if (umResponse.getFirst() == ServerErrorCode.No_Error) {
              writeBufferToFile(umResponse.getSecond(), outputFileStream);
              LOGGER.info("User metadata for {} from {} written to {}", blobId, dataNodeId,
                  config.dataOutputFilePath);
            } else {
              LOGGER.error("Failed to get user metadata for {} from {} with option {}. Error code is {}", blobId,
                  dataNodeId, config.getOption, umResponse.getFirst());
            }
            break;
          case GetBlob:
            blobId = new BlobId(config.blobId, clusterMap);
            Pair<ServerErrorCode, BlobData> bResponse =
                serverAdminTool.getBlob(dataNodeId, blobId, config.getOption, clusterMap);
            if (bResponse.getFirst() == ServerErrorCode.No_Error) {
              LOGGER.info("Blob type of {} from {} is {}", blobId, dataNodeId, bResponse.getSecond().getBlobType());
              ByteBuf buffer = bResponse.getSecond().content();
              try {
                writeByteBufToFile(buffer, outputFileStream);
              } finally {
                // release the buffer whether or not the write succeeded
                buffer.release();
              }
              LOGGER.info("Blob data for {} from {} written to {}", blobId, dataNodeId, config.dataOutputFilePath);
            } else {
              LOGGER.error("Failed to get blob data for {} from {} with option {}. Error code is {}", blobId,
                  dataNodeId, config.getOption, bResponse.getFirst());
            }
            break;
          case TriggerCompaction:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
              for (String partitionIdStr : config.partitionIds) {
                PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                ServerErrorCode errorCode = serverAdminTool.triggerCompaction(dataNodeId, partitionId);
                if (errorCode == ServerErrorCode.No_Error) {
                  LOGGER.info("Compaction has been triggered for {} on {}", partitionId, dataNodeId);
                } else {
                  LOGGER.error("From {}, received server error code {} for trigger compaction request on {}",
                      dataNodeId, errorCode, partitionId);
                }
              }
            } else {
              LOGGER.error("There were no partitions provided to trigger compaction on");
            }
            break;
          case RequestControl:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
              for (String partitionIdStr : config.partitionIds) {
                PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                sendRequestControlRequest(serverAdminTool, dataNodeId, partitionId, config.requestTypeToControl,
                    config.enableState);
              }
            } else {
              // a null partition is passed when the request is meant for all partitions
              LOGGER.info("No partition list provided. Requesting enable status of {} to be set to {} on all partitions",
                  config.requestTypeToControl, config.enableState);
              sendRequestControlRequest(serverAdminTool, dataNodeId, null, config.requestTypeToControl,
                  config.enableState);
            }
            break;
          case ReplicationControl:
            List<String> origins = Collections.emptyList();
            if (config.origins.length > 0 && !config.origins[0].isEmpty()) {
              origins = Arrays.asList(config.origins);
            }
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
              for (String partitionIdStr : config.partitionIds) {
                PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                sendReplicationControlRequest(serverAdminTool, dataNodeId, partitionId, origins, config.enableState);
              }
            } else {
              // a null partition is passed when the request is meant for all partitions
              LOGGER.info("No partition list provided. Requesting enable status for replication from {} to be set to {} on "
                  + "all partitions", origins.isEmpty() ? "all DCs" : origins, config.enableState);
              sendReplicationControlRequest(serverAdminTool, dataNodeId, null, origins, config.enableState);
            }
            break;
          case CatchupStatus:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
              for (String partitionIdStr : config.partitionIds) {
                PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                Pair<ServerErrorCode, Boolean> response =
                    serverAdminTool.isCaughtUp(dataNodeId, partitionId, config.acceptableLagInBytes,
                        config.numReplicasCaughtUpPerPartition);
                if (response.getFirst() == ServerErrorCode.No_Error) {
                  LOGGER.info("Replicas are {} within {} bytes for {}", response.getSecond() ? "" : "NOT",
                      config.acceptableLagInBytes, partitionId);
                } else {
                  LOGGER.error("From {}, received server error code {} for request for catchup status of {}",
                      dataNodeId, response.getFirst(), partitionId);
                }
              }
            } else {
              // a null partition is passed when the status is requested for all partitions
              Pair<ServerErrorCode, Boolean> response =
                  serverAdminTool.isCaughtUp(dataNodeId, null, config.acceptableLagInBytes,
                      config.numReplicasCaughtUpPerPartition);
              if (response.getFirst() == ServerErrorCode.No_Error) {
                LOGGER.info("Replicas are {} within {} bytes for all partitions", response.getSecond() ? "" : "NOT",
                    config.acceptableLagInBytes);
              } else {
                LOGGER.error("From {}, received server error code {} for request for catchup status of all partitions",
                    dataNodeId, response.getFirst());
              }
            }
            break;
          case BlobStoreControl:
            if (config.partitionIds.length > 0 && !config.partitionIds[0].isEmpty()) {
              for (String partitionIdStr : config.partitionIds) {
                PartitionId partitionId = getPartitionIdFromStr(partitionIdStr, clusterMap);
                sendBlobStoreControlRequest(serverAdminTool, dataNodeId, partitionId,
                    config.numReplicasCaughtUpPerPartition, config.storeControlRequestType);
              }
            } else {
              LOGGER.error("There were no partitions provided to be controlled (Start/Stop)");
            }
            break;
          default:
            throw new IllegalStateException("Recognized but unsupported operation: " + config.typeOfOperation);
        }
      }
    } finally {
      // runs on both success and failure so that the tool is never left open
      serverAdminTool.close();
    }
  } finally {
    // closed last because the tool was constructed with this cluster map
    clusterMap.close();
  }
  System.out.println("Server admin tool is safely closed");
  System.exit(0);
}
Aggregations