Use of com.github.ambry.clustermap.PartitionId in project ambry by linkedin.
The class StatsManager, method getNodeStatsInJSON.
/**
 * Gets the combined {@link StatsSnapshot} of all partitions on this node. The returned JSON contains one entry per
 * partition with its valid data size.
 * @return the combined {@link StatsSnapshot} of this node as a JSON string, or an empty string if aggregation fails
 */
String getNodeStatsInJSON() {
  String statsWrapperJSON = "";
  try {
    long totalFetchAndAggregateStartTimeMs = time.milliseconds();
    StatsSnapshot combinedSnapshot = new StatsSnapshot(0L, new HashMap<String, StatsSnapshot>());
    long totalValue = 0;
    List<String> unreachableStores = new ArrayList<>();
    for (PartitionId partitionId : totalPartitionIds) {
      long fetchSnapshotStartTimeMs = time.milliseconds();
      StatsSnapshot statsSnapshot = fetchSnapshot(partitionId, unreachableStores);
      if (statsSnapshot != null) {
        combinedSnapshot.getSubMap().put(partitionId.toString(), statsSnapshot);
        totalValue += statsSnapshot.getValue();
      }
      metrics.fetchAndAggregateTimePerStoreMs.update(time.milliseconds() - fetchSnapshotStartTimeMs);
    }
    combinedSnapshot.setValue(totalValue);
    metrics.totalFetchAndAggregateTimeMs.update(time.milliseconds() - totalFetchAndAggregateStartTimeMs);
    StatsHeader statsHeader =
        new StatsHeader(StatsHeader.StatsDescription.QUOTA, time.milliseconds(), totalPartitionIds.size(),
            totalPartitionIds.size() - unreachableStores.size(), unreachableStores);
    statsWrapperJSON = mapper.writeValueAsString(new StatsWrapper(statsHeader, combinedSnapshot));
  } catch (Exception | Error e) {
    metrics.statsAggregationFailureCount.inc();
    logger.error("Exception while aggregating stats.", e);
  }
  return statsWrapperJSON;
}
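For illustration, if the node hosts two reachable partitions with valid data sizes of 1024 and 2048 bytes, the returned JSON would look roughly like the sketch below. The field names assume Jackson's default mapping of StatsWrapper, StatsHeader, and StatsSnapshot, and the partition keys depend on PartitionId.toString(); treat the exact shape as an assumption rather than the canonical wire format.

{
  "header" : {
    "description" : "QUOTA",
    "timestamp" : 1510000000000,
    "storesContactedCount" : 2,
    "storesRespondedCount" : 2,
    "unreachableStores" : [ ]
  },
  "snapshot" : {
    "value" : 3072,
    "subMap" : {
      "Partition[0]" : { "value" : 1024, "subMap" : null },
      "Partition[1]" : { "value" : 2048, "subMap" : null }
    }
  }
}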
Use of com.github.ambry.clustermap.PartitionId in project ambry by linkedin.
The class AmbryRequestsTest, method stopBlobStoreFailureTest.
/**
 * Tests the response received on a {@link BlobStoreControlAdminRequest} for different failure cases.
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void stopBlobStoreFailureTest() throws InterruptedException, IOException {
  List<? extends PartitionId> partitionIds = clusterMap.getAllPartitionIds();
  PartitionId id = partitionIds.get(0);
  int correlationId = TestUtils.RANDOM.nextInt();
  String clientId = UtilsTest.getRandomString(10);
  short numReplicasCaughtUpPerPartition = -1;
  // test invalid numReplicasCaughtUpPerPartition
  AdminRequest adminRequest =
      new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  BlobStoreControlAdminRequest blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  Response response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Bad_Request);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  // test partition unknown
  numReplicasCaughtUpPerPartition = 3;
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, null, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Partition_Unknown);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  // test validate request failure
  storageManager.returnNullStore = true;
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Disk_Unavailable);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  storageManager.returnNullStore = false;
  // test disable compaction failure
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  storageManager.returnValueOfDisablingCompaction = false;
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  storageManager.returnValueOfDisablingCompaction = true;
  // test disable compaction with runtime exception
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  storageManager.exceptionToThrowOnDisablingCompaction = new IllegalStateException();
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  storageManager.exceptionToThrowOnDisablingCompaction = null;
  // test disable replication failure
  replicationManager.reset();
  replicationManager.controlReplicationReturnVal = false;
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  // test peers catchup failure
  replicationManager.reset();
  replicationManager.controlReplicationReturnVal = true;
  // all replicas of this partition > acceptableLag
  generateLagOverrides(1, 1);
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Retry_After_Backoff);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  // test shutdown BlobStore failure
  replicationManager.reset();
  replicationManager.controlReplicationReturnVal = true;
  storageManager.returnValueOfShutdownBlobStore = false;
  generateLagOverrides(0, 0);
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  // test shutdown BlobStore with runtime exception
  storageManager.exceptionToThrowOnShuttingdownBlobStore = new IllegalStateException();
  adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
  blobStoreControlAdminRequest =
      new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
  response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
  assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
  storageManager.exceptionToThrowOnShuttingdownBlobStore = null;
}
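For contrast with the failure cases above, a success path would clear each precondition before sending the request. The sketch below is illustrative and not part of the actual test; it assumes that once compaction can be disabled, peers are caught up, and the store shuts down cleanly, the server replies with No_Error.

// Hedged sketch: the success path for stopping a store, assuming the
// failure preconditions exercised above are all cleared first.
replicationManager.reset();
replicationManager.controlReplicationReturnVal = true;
storageManager.returnValueOfDisablingCompaction = true;
storageManager.returnValueOfShutdownBlobStore = true;
generateLagOverrides(0, 0); // every peer reports zero lag, so all are caught up
AdminRequest adminRequest =
    new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
BlobStoreControlAdminRequest stopRequest =
    new BlobStoreControlAdminRequest((short) 3, false, adminRequest);
Response response = sendRequestGetResponse(stopRequest, ServerErrorCode.No_Error);
assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);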
Use of com.github.ambry.clustermap.PartitionId in project ambry by linkedin.
The class AmbryRequestsTest, method catchupStatusSuccessTest.
/**
 * Tests the response received on a {@link CatchupStatusAdminRequest} for different cases.
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void catchupStatusSuccessTest() throws InterruptedException, IOException {
  List<? extends PartitionId> partitionIds = clusterMap.getAllPartitionIds();
  assertTrue("This test needs more than one partition to work", partitionIds.size() > 1);
  PartitionId id = partitionIds.get(0);
  ReplicaId thisPartRemoteRep = getRemoteReplicaId(id);
  ReplicaId otherPartRemoteRep = getRemoteReplicaId(partitionIds.get(1));
  List<? extends ReplicaId> replicaIds = id.getReplicaIds();
  assertTrue("This test needs more than one replica for the first partition to work", replicaIds.size() > 1);
  long acceptableLagInBytes = 100;
  // cases with a given partition id
  // all replicas of given partition < acceptableLag
  generateLagOverrides(0, acceptableLagInBytes - 1);
  doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
  // all replicas of given partition = acceptableLag
  generateLagOverrides(acceptableLagInBytes, acceptableLagInBytes);
  doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
  // 1 replica of some other partition > acceptableLag
  String key = MockReplicationManager.getPartitionLagKey(otherPartRemoteRep.getPartitionId(),
      otherPartRemoteRep.getDataNodeId().getHostname(), otherPartRemoteRep.getReplicaPath());
  replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
  doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
  // 1 replica of this partition > acceptableLag
  key = MockReplicationManager.getPartitionLagKey(id, thisPartRemoteRep.getDataNodeId().getHostname(),
      thisPartRemoteRep.getReplicaPath());
  replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
  doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
  // same result if num expected replicas == total count - 1
  doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 1), ServerErrorCode.No_Error, false);
  // caught up if num expected replicas == total count - 2
  doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 2), ServerErrorCode.No_Error, true);
  // caught up if num expected replicas == total count - 3
  doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 3), ServerErrorCode.No_Error, true);
  // all replicas of this partition > acceptableLag
  generateLagOverrides(acceptableLagInBytes + 1, acceptableLagInBytes + 1);
  doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
  // cases with no partition id provided
  // all replicas of all partitions < acceptableLag
  generateLagOverrides(0, acceptableLagInBytes - 1);
  doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
  // all replicas of all partitions = acceptableLag
  generateLagOverrides(acceptableLagInBytes, acceptableLagInBytes);
  doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
  // 1 replica of one partition > acceptableLag
  key = MockReplicationManager.getPartitionLagKey(id, thisPartRemoteRep.getDataNodeId().getHostname(),
      thisPartRemoteRep.getReplicaPath());
  replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
  doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
  // same result if num expected replicas == total count - 1
  doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 1), ServerErrorCode.No_Error, false);
  // caught up if num expected replicas == total count - 2
  doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 2), ServerErrorCode.No_Error, true);
  // caught up if num expected replicas == total count - 3
  doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 3), ServerErrorCode.No_Error, true);
  // all replicas of all partitions > acceptableLag
  generateLagOverrides(acceptableLagInBytes + 1, acceptableLagInBytes + 1);
  doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
}
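All of the expectations above reduce to one predicate. The minimal sketch below captures it; the method name, parameters, and the min() clamp are inferred from the test cases rather than taken from the production code.

// Hedged sketch of the catch-up check this test exercises: a partition counts
// as caught up when at least min(numExpectedReplicas, peerCount) of its remote
// peers have replication lag <= acceptableLagInBytes. With one lagging peer,
// expecting (total - 1) peers fails while expecting (total - 2) succeeds,
// exactly as asserted above.
boolean isCaughtUp(List<Long> peerLagsInBytes, short numExpectedReplicas, long acceptableLagInBytes) {
  long caughtUpPeers = peerLagsInBytes.stream().filter(lag -> lag <= acceptableLagInBytes).count();
  return caughtUpPeers >= Math.min(numExpectedReplicas, peerLagsInBytes.size());
}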
Use of com.github.ambry.clustermap.PartitionId in project ambry by linkedin.
The class AmbryRequestsTest, method controlReplicationSuccessTest.
/**
* Tests that {@link AdminRequestOrResponseType#ReplicationControl} works correctly.
* @throws InterruptedException
* @throws IOException
*/
@Test
public void controlReplicationSuccessTest() throws InterruptedException, IOException {
  List<? extends PartitionId> partitionIds = clusterMap.getWritablePartitionIds();
  for (PartitionId id : partitionIds) {
    doControlReplicationTest(id, ServerErrorCode.No_Error);
  }
  doControlReplicationTest(null, ServerErrorCode.No_Error);
}
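For orientation, the sketch below shows the kind of request the doControlReplicationTest helper presumably builds for each partition (or null for all partitions). The ReplicationControlAdminRequest constructor shape and the origin datacenter name "DC1" are assumptions for illustration, not taken from the helper's actual body.

// Hedged sketch: disable replication from an assumed origin datacenter "DC1"
// for the given partition, expecting No_Error on success.
AdminRequest adminRequest =
    new AdminRequest(AdminRequestOrResponseType.ReplicationControl, id, correlationId, clientId);
ReplicationControlAdminRequest controlRequest =
    new ReplicationControlAdminRequest(Collections.singletonList("DC1"), false, adminRequest);
Response response = sendRequestGetResponse(controlRequest, ServerErrorCode.No_Error);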
Use of com.github.ambry.clustermap.PartitionId in project ambry by linkedin.
The class AmbryRequestsTest, method generateLagOverrides.
/**
 * Generates lag overrides in {@code replicationManager}, with each lag a value between {@code base} and
 * {@code upperBound}, both inclusive.
 * @param base the minimum value of lag (inclusive)
 * @param upperBound the maximum value of lag (inclusive)
 */
private void generateLagOverrides(long base, long upperBound) {
  replicationManager.lagOverrides = new HashMap<>();
  for (PartitionId partitionId : clusterMap.getAllPartitionIds()) {
    for (ReplicaId replicaId : partitionId.getReplicaIds()) {
      String key = MockReplicationManager.getPartitionLagKey(partitionId,
          replicaId.getDataNodeId().getHostname(), replicaId.getReplicaPath());
      Long value = base + Utils.getRandomLong(TestUtils.RANDOM, upperBound - base + 1);
      replicationManager.lagOverrides.put(key, value);
    }
  }
}
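A quick check of the bounds arithmetic, assuming Utils.getRandomLong(random, n) returns a uniform value in [0, n):

// With base = 100 and upperBound = 102: upperBound - base + 1 == 3, so
// getRandomLong(TestUtils.RANDOM, 3) yields 0, 1, or 2, and the stored lag
// is 100, 101, or 102 -- both endpoints inclusive, as the javadoc promises.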