
Example 6 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

From the class StatsManager, method getNodeStatsInJSON.

/**
 * Get the combined {@link StatsSnapshot} of all partitions in this node. The returned JSON contains one entry
 * per partition recording its valid data size.
 * @return the combined stats of this node, serialized as a JSON string
 */
String getNodeStatsInJSON() {
    String statsWrapperJSON = "";
    try {
        long totalFetchAndAggregateStartTimeMs = time.milliseconds();
        StatsSnapshot combinedSnapshot = new StatsSnapshot(0L, new HashMap<String, StatsSnapshot>());
        long totalValue = 0;
        List<String> unreachableStores = new ArrayList<>();
        for (PartitionId partitionId : totalPartitionIds) {
            long fetchSnapshotStartTimeMs = time.milliseconds();
            StatsSnapshot statsSnapshot = fetchSnapshot(partitionId, unreachableStores);
            if (statsSnapshot != null) {
                combinedSnapshot.getSubMap().put(partitionId.toString(), statsSnapshot);
                totalValue += statsSnapshot.getValue();
            }
            metrics.fetchAndAggregateTimePerStoreMs.update(time.milliseconds() - fetchSnapshotStartTimeMs);
        }
        combinedSnapshot.setValue(totalValue);
        metrics.totalFetchAndAggregateTimeMs.update(time.milliseconds() - totalFetchAndAggregateStartTimeMs);
        StatsHeader statsHeader = new StatsHeader(StatsHeader.StatsDescription.QUOTA, time.milliseconds(), totalPartitionIds.size(), totalPartitionIds.size() - unreachableStores.size(), unreachableStores);
        statsWrapperJSON = mapper.writeValueAsString(new StatsWrapper(statsHeader, combinedSnapshot));
    } catch (Exception | Error e) {
        metrics.statsAggregationFailureCount.inc();
        logger.error("Exception while aggregating stats.", e);
    }
    return statsWrapperJSON;
}
Also used : ArrayList(java.util.ArrayList) PartitionId(com.github.ambry.clustermap.PartitionId) IOException(java.io.IOException) StoreException(com.github.ambry.store.StoreException)
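
As a reference for the aggregation pattern above, the following is a minimal sketch (illustration only, not Ambry source) that combines a map of per-partition valid data sizes into one parent StatsSnapshot, the same way the loop in getNodeStatsInJSON does. The StatsSnapshot import path and the combine method name are assumptions.

// Minimal sketch, illustration only (not Ambry source): build a node-level StatsSnapshot from
// per-partition valid data sizes, mirroring the loop in getNodeStatsInJSON above.
// The StatsSnapshot import path below is an assumption; adjust it to the Ambry version in use.
import com.github.ambry.server.StatsSnapshot;

import java.util.HashMap;
import java.util.Map;

public class NodeSnapshotSketch {

    /**
     * @param validSizeByPartition partition name mapped to its valid data size in bytes
     * @return a snapshot with one sub-entry per partition and the sum of all sizes as its value
     */
    static StatsSnapshot combine(Map<String, Long> validSizeByPartition) {
        StatsSnapshot combined = new StatsSnapshot(0L, new HashMap<String, StatsSnapshot>());
        long total = 0;
        for (Map.Entry<String, Long> entry : validSizeByPartition.entrySet()) {
            // each partition becomes a child snapshot carrying only its own valid data size
            combined.getSubMap().put(entry.getKey(), new StatsSnapshot(entry.getValue(), new HashMap<String, StatsSnapshot>()));
            total += entry.getValue();
        }
        combined.setValue(total);
        return combined;
    }
}

In the real method, each child value comes from fetchSnapshot(partitionId, unreachableStores) rather than from a precomputed map.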

Example 7 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

From the class AmbryRequestsTest, method stopBlobStoreFailureTest.

/**
 * Tests for the response received on a {@link BlobStoreControlAdminRequest} for different failure cases
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void stopBlobStoreFailureTest() throws InterruptedException, IOException {
    List<? extends PartitionId> partitionIds = clusterMap.getAllPartitionIds();
    PartitionId id = partitionIds.get(0);
    int correlationId = TestUtils.RANDOM.nextInt();
    String clientId = UtilsTest.getRandomString(10);
    short numReplicasCaughtUpPerPartition = -1;
    // test invalid numReplicasCaughtUpPerPartition
    AdminRequest adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    BlobStoreControlAdminRequest blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    Response response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Bad_Request);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    // test partition unknown
    numReplicasCaughtUpPerPartition = 3;
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, null, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Partition_Unknown);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    // test validate request failure
    storageManager.returnNullStore = true;
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Disk_Unavailable);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    storageManager.returnNullStore = false;
    // test disable compaction failure
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    storageManager.returnValueOfDisablingCompaction = false;
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    storageManager.returnValueOfDisablingCompaction = true;
    // test disable compaction with runtime exception
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    storageManager.exceptionToThrowOnDisablingCompaction = new IllegalStateException();
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    storageManager.exceptionToThrowOnDisablingCompaction = null;
    // test disable replication failure
    replicationManager.reset();
    replicationManager.controlReplicationReturnVal = false;
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    // test peers catchup failure
    replicationManager.reset();
    replicationManager.controlReplicationReturnVal = true;
    // all replicas of this partition > acceptableLag
    generateLagOverrides(1, 1);
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Retry_After_Backoff);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    // test shutdown BlobStore failure
    replicationManager.reset();
    replicationManager.controlReplicationReturnVal = true;
    storageManager.returnValueOfShutdownBlobStore = false;
    generateLagOverrides(0, 0);
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    // test shutdown BlobStore with runtime exception
    storageManager.exceptionToThrowOnShuttingdownBlobStore = new IllegalStateException();
    adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id, correlationId, clientId);
    blobStoreControlAdminRequest = new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    response = sendRequestGetResponse(blobStoreControlAdminRequest, ServerErrorCode.Unknown_Error);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
    storageManager.exceptionToThrowOnShuttingdownBlobStore = null;
}
Also used : CatchupStatusAdminRequest(com.github.ambry.protocol.CatchupStatusAdminRequest) AdminRequest(com.github.ambry.protocol.AdminRequest) ReplicationControlAdminRequest(com.github.ambry.protocol.ReplicationControlAdminRequest) RequestControlAdminRequest(com.github.ambry.protocol.RequestControlAdminRequest) BlobStoreControlAdminRequest(com.github.ambry.protocol.BlobStoreControlAdminRequest) CatchupStatusAdminResponse(com.github.ambry.protocol.CatchupStatusAdminResponse) GetResponse(com.github.ambry.protocol.GetResponse) ReplicaMetadataResponse(com.github.ambry.protocol.ReplicaMetadataResponse) AdminResponse(com.github.ambry.protocol.AdminResponse) RequestOrResponse(com.github.ambry.protocol.RequestOrResponse) Response(com.github.ambry.protocol.Response) PartitionId(com.github.ambry.clustermap.PartitionId) UtilsTest(com.github.ambry.utils.UtilsTest) Test(org.junit.Test)
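
The build-send-assert sequence repeated throughout this test could be collapsed into a small helper. The sketch below is hypothetical (it is not part of AmbryRequestsTest) and assumes it would live inside the test class next to the existing sendRequestGetResponse helper, using only the request and response types already shown above.

// Hypothetical helper, assumed to live inside AmbryRequestsTest: builds a BlobStoreControlAdminRequest
// for the given partition, sends it, and verifies the response type, mirroring the steps above.
private void sendBlobStoreControlAndVerify(PartitionId id, short numReplicasCaughtUpPerPartition,
    ServerErrorCode expectedErrorCode) throws InterruptedException, IOException {
    AdminRequest adminRequest = new AdminRequest(AdminRequestOrResponseType.BlobStoreControl, id,
        TestUtils.RANDOM.nextInt(), UtilsTest.getRandomString(10));
    BlobStoreControlAdminRequest controlRequest =
        new BlobStoreControlAdminRequest(numReplicasCaughtUpPerPartition, false, adminRequest);
    Response response = sendRequestGetResponse(controlRequest, expectedErrorCode);
    assertTrue("Response not of type AdminResponse", response instanceof AdminResponse);
}

Each failure case would then reduce to arranging the storageManager or replicationManager state and calling the helper with the expected ServerErrorCode.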

Example 8 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

From the class AmbryRequestsTest, method catchupStatusSuccessTest.

/**
 * Tests for the response received on a {@link CatchupStatusAdminRequest} for different cases
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void catchupStatusSuccessTest() throws InterruptedException, IOException {
    List<? extends PartitionId> partitionIds = clusterMap.getAllPartitionIds();
    assertTrue("This test needs more than one partition to work", partitionIds.size() > 1);
    PartitionId id = partitionIds.get(0);
    ReplicaId thisPartRemoteRep = getRemoteReplicaId(id);
    ReplicaId otherPartRemoteRep = getRemoteReplicaId(partitionIds.get(1));
    List<? extends ReplicaId> replicaIds = id.getReplicaIds();
    assertTrue("This test needs more than one replica for the first partition to work", replicaIds.size() > 1);
    long acceptableLagInBytes = 100;
    // cases with a given partition id
    // all replicas of given partition < acceptableLag
    generateLagOverrides(0, acceptableLagInBytes - 1);
    doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
    // all replicas of given partition = acceptableLag
    generateLagOverrides(acceptableLagInBytes, acceptableLagInBytes);
    doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
    // 1 replica of some other partition > acceptableLag
    String key = MockReplicationManager.getPartitionLagKey(otherPartRemoteRep.getPartitionId(), otherPartRemoteRep.getDataNodeId().getHostname(), otherPartRemoteRep.getReplicaPath());
    replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
    doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
    // 1 replica of this partition > acceptableLag
    key = MockReplicationManager.getPartitionLagKey(id, thisPartRemoteRep.getDataNodeId().getHostname(), thisPartRemoteRep.getReplicaPath());
    replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
    doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
    // same result if num expected replicas == total count - 1.
    doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 1), ServerErrorCode.No_Error, false);
    // caught up if num expected replicas == total count - 2
    doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 2), ServerErrorCode.No_Error, true);
    // caught up if num expected replicas == total count - 3
    doCatchupStatusTest(id, acceptableLagInBytes, (short) (replicaIds.size() - 3), ServerErrorCode.No_Error, true);
    // all replicas of this partition > acceptableLag
    generateLagOverrides(acceptableLagInBytes + 1, acceptableLagInBytes + 1);
    doCatchupStatusTest(id, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
    // cases with no partition id provided
    // all replicas of all partitions < acceptableLag
    generateLagOverrides(0, acceptableLagInBytes - 1);
    doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
    // all replicas of all partitions = acceptableLag
    generateLagOverrides(acceptableLagInBytes, acceptableLagInBytes);
    doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, true);
    // 1 replica of one partition > acceptableLag
    key = MockReplicationManager.getPartitionLagKey(id, thisPartRemoteRep.getDataNodeId().getHostname(), thisPartRemoteRep.getReplicaPath());
    replicationManager.lagOverrides.put(key, acceptableLagInBytes + 1);
    doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
    // same result if num expected replicas == total count - 1.
    doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 1), ServerErrorCode.No_Error, false);
    // caught up if num expected replicas == total count - 2
    doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 2), ServerErrorCode.No_Error, true);
    // caught up if num expected replicas == total count - 3
    doCatchupStatusTest(null, acceptableLagInBytes, (short) (replicaIds.size() - 3), ServerErrorCode.No_Error, true);
    // all replicas of all partitions > acceptableLag
    generateLagOverrides(acceptableLagInBytes + 1, acceptableLagInBytes + 1);
    doCatchupStatusTest(null, acceptableLagInBytes, Short.MAX_VALUE, ServerErrorCode.No_Error, false);
}
Also used : PartitionId(com.github.ambry.clustermap.PartitionId) ReplicaId(com.github.ambry.clustermap.ReplicaId) UtilsTest(com.github.ambry.utils.UtilsTest) Test(org.junit.Test)
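
The expectations in this test suggest a simple rule: a partition reports as caught up once enough of its replicas are within the acceptable lag. The sketch below is a rough, hypothetical restatement of that rule inferred from the cases above; it is not the Ambry implementation, and the class and method names are illustrative only.

// Hypothetical sketch of the rule inferred from the cases above; not the Ambry implementation.
// A partition is treated as caught up when at least the requested number of replicas has
// lag <= acceptableLagInBytes; Short.MAX_VALUE is capped to the replica count, i.e. "all replicas".
import java.util.Map;

public class CatchupRuleSketch {

    static boolean isCaughtUp(Map<String, Long> lagByReplica, long acceptableLagInBytes,
        short numReplicasCaughtUpPerPartition) {
        int caughtUp = 0;
        for (long lag : lagByReplica.values()) {
            if (lag <= acceptableLagInBytes) {
                caughtUp++;
            }
        }
        int required = Math.min(numReplicasCaughtUpPerPartition, lagByReplica.size());
        return caughtUp >= required;
    }
}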

Example 9 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

From the class AmbryRequestsTest, method controlReplicationSuccessTest.

/**
 * Tests that {@link AdminRequestOrResponseType#ReplicationControl} works correctly.
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void controlReplicationSuccessTest() throws InterruptedException, IOException {
    List<? extends PartitionId> partitionIds = clusterMap.getWritablePartitionIds();
    for (PartitionId id : partitionIds) {
        doControlReplicationTest(id, ServerErrorCode.No_Error);
    }
    doControlReplicationTest(null, ServerErrorCode.No_Error);
}
Also used : PartitionId(com.github.ambry.clustermap.PartitionId) UtilsTest(com.github.ambry.utils.UtilsTest) Test(org.junit.Test)

Example 10 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

From the class AmbryRequestsTest, method generateLagOverrides.

/**
 * Generates lag overrides in {@code replicationManager}, with each lag a value between {@code base} and
 * {@code upperBound}, both inclusive.
 * @param base the minimum value of lag (inclusive)
 * @param upperBound the maximum value of lag (inclusive)
 */
private void generateLagOverrides(long base, long upperBound) {
    replicationManager.lagOverrides = new HashMap<>();
    for (PartitionId partitionId : clusterMap.getAllPartitionIds()) {
        for (ReplicaId replicaId : partitionId.getReplicaIds()) {
            String key = MockReplicationManager.getPartitionLagKey(partitionId, replicaId.getDataNodeId().getHostname(), replicaId.getReplicaPath());
            Long value = base + Utils.getRandomLong(TestUtils.RANDOM, upperBound - base + 1);
            replicationManager.lagOverrides.put(key, value);
        }
    }
}
Also used : PartitionId(com.github.ambry.clustermap.PartitionId) ReplicaId(com.github.ambry.clustermap.ReplicaId)
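
The inclusive range produced by generateLagOverrides comes from adding base to a random value in [0, upperBound - base]. The following self-contained sketch (plain JDK, hypothetical class name) shows the same inclusive-range computation with ThreadLocalRandom, as a quick way to verify the bounds.

// Self-contained sketch (plain JDK, hypothetical class name) of the same inclusive-range math:
// ThreadLocalRandom.nextLong(origin, bound) excludes bound, so upperBound + 1 makes it inclusive,
// matching base + Utils.getRandomLong(TestUtils.RANDOM, upperBound - base + 1) above.
import java.util.concurrent.ThreadLocalRandom;

public class LagRangeSketch {

    static long randomLagInclusive(long base, long upperBound) {
        return ThreadLocalRandom.current().nextLong(base, upperBound + 1);
    }

    public static void main(String[] args) {
        for (int i = 0; i < 5; i++) {
            // prints values in [0, 99]
            System.out.println(randomLagInclusive(0, 99));
        }
    }
}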

Aggregations

PartitionId (com.github.ambry.clustermap.PartitionId) 183
MockPartitionId (com.github.ambry.clustermap.MockPartitionId) 111
Test (org.junit.Test) 95
ReplicaId (com.github.ambry.clustermap.ReplicaId) 70
ArrayList (java.util.ArrayList) 68
MockClusterMap (com.github.ambry.clustermap.MockClusterMap) 53
BlobId (com.github.ambry.commons.BlobId) 50
HashMap (java.util.HashMap) 48
Map (java.util.Map) 41
List (java.util.List) 40
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId) 39
DataNodeId (com.github.ambry.clustermap.DataNodeId) 36
MetricRegistry (com.codahale.metrics.MetricRegistry) 33
ClusterMap (com.github.ambry.clustermap.ClusterMap) 32
MockReplicaId (com.github.ambry.clustermap.MockReplicaId) 30
VerifiableProperties (com.github.ambry.config.VerifiableProperties) 30
IOException (java.io.IOException) 29
HashSet (java.util.HashSet) 29
StoreKey (com.github.ambry.store.StoreKey) 26
StoreKeyFactory (com.github.ambry.store.StoreKeyFactory) 25