
Example 81 with ReplicaId

Use of com.github.ambry.clustermap.ReplicaId in project ambry by LinkedIn.

From class StorageManagerTest, method addBlobStoreTest.

/**
 * Tests adding a new BlobStore with a given {@link ReplicaId}.
 */
@Test
public void addBlobStoreTest() throws Exception {
    generateConfigs(true, false);
    MockDataNodeId localNode = clusterMap.getDataNodes().get(0);
    List<ReplicaId> localReplicas = clusterMap.getReplicaIds(localNode);
    int newMountPathIndex = 3;
    // add new MountPath to local node
    File f = File.createTempFile("ambry", ".tmp");
    File mountFile = new File(f.getParent(), "mountpathfile" + MockClusterMap.PLAIN_TEXT_PORT_START_NUMBER + newMountPathIndex);
    MockClusterMap.deleteFileOrDirectory(mountFile);
    assertTrue("Couldn't create mount path directory", mountFile.mkdir());
    localNode.addMountPaths(Collections.singletonList(mountFile.getAbsolutePath()));
    PartitionId newPartition1 = new MockPartitionId(10L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), newMountPathIndex);
    StorageManager storageManager = createStorageManager(localNode, metricRegistry, null);
    storageManager.start();
    // test add store that already exists, which should fail
    assertFalse("Add store which is already existing should fail", storageManager.addBlobStore(localReplicas.get(0)));
    // test add store onto a new disk, which should succeed
    assertTrue("Add new store should succeed", storageManager.addBlobStore(newPartition1.getReplicaIds().get(0)));
    assertNotNull("The store shouldn't be null because new store is successfully added", storageManager.getStore(newPartition1, false));
    // test add store whose diskManager is not running, which should fail
    PartitionId newPartition2 = new MockPartitionId(11L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), 0);
    storageManager.getDiskManager(localReplicas.get(0).getPartitionId()).shutdown();
    assertFalse("Add store onto the DiskManager which is not running should fail", storageManager.addBlobStore(newPartition2.getReplicaIds().get(0)));
    storageManager.getDiskManager(localReplicas.get(0).getPartitionId()).start();
    // test that replica addition correctly handles an existing dir (it should be deleted and recreated)
    // To verify the directory has been recreated, we purposely put a test file in the previous dir.
    PartitionId newPartition3 = new MockPartitionId(12L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), 0);
    ReplicaId replicaToAdd = newPartition3.getReplicaIds().get(0);
    File previousDir = new File(replicaToAdd.getReplicaPath());
    File testFile = new File(previousDir, "testFile");
    MockClusterMap.deleteFileOrDirectory(previousDir);
    assertTrue("Cannot create dir for " + replicaToAdd.getReplicaPath(), previousDir.mkdir());
    assertTrue("Cannot create test file within previous dir", testFile.createNewFile());
    assertTrue("Adding new store should succeed", storageManager.addBlobStore(replicaToAdd));
    assertFalse("Test file should not exist", testFile.exists());
    assertNotNull("Store associated new added replica should not be null", storageManager.getStore(newPartition3, false));
    shutdownAndAssertStoresInaccessible(storageManager, localReplicas);
    // test adding a store that fails to register its segment requirements with the DiskSpaceAllocator (simulated
    // by inducing an addRequiredSegments failure, which makes the store inaccessible)
    List<String> mountPaths = localNode.getMountPaths();
    String diskToFail = mountPaths.get(0);
    File reservePoolDir = new File(diskToFail, diskManagerConfig.diskManagerReserveFileDirName);
    File storeReserveDir = new File(reservePoolDir, DiskSpaceAllocator.STORE_DIR_PREFIX + newPartition2.toPathString());
    StorageManager storageManager2 = createStorageManager(localNode, new MetricRegistry(), null);
    storageManager2.start();
    Utils.deleteFileOrDirectory(storeReserveDir);
    assertTrue("File creation should succeed", storeReserveDir.createNewFile());
    assertFalse("Add store should fail if store couldn't start due to initializePool failure", storageManager2.addBlobStore(newPartition2.getReplicaIds().get(0)));
    assertNull("New store shouldn't be in in-memory data structure", storageManager2.getStore(newPartition2, false));
    shutdownAndAssertStoresInaccessible(storageManager2, localReplicas);
}
Also used: MockPartitionId (com.github.ambry.clustermap.MockPartitionId), MetricRegistry (com.codahale.metrics.MetricRegistry), MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId), PartitionId (com.github.ambry.clustermap.PartitionId), File (java.io.File), ReplicaId (com.github.ambry.clustermap.ReplicaId), BlobStoreTest (com.github.ambry.store.BlobStoreTest), Test (org.junit.Test)
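
The test above packs several scenarios into one method; the invariant behind the "existing dir" case is worth isolating. Below is a minimal sketch of that invariant, assuming a started StorageManager and a replica whose store has not been added yet; the marker file name is hypothetical:

import com.github.ambry.clustermap.ReplicaId;
import com.github.ambry.store.StorageManager;
import java.io.File;
import java.io.IOException;

/**
 * Sketch of the "stale dir is recreated" invariant verified above: a leftover replica
 * directory is wiped before the new store is created, so pre-existing files vanish.
 */
static boolean addStoreRecreatesDir(StorageManager storageManager, ReplicaId replica) throws IOException {
    File replicaDir = new File(replica.getReplicaPath());
    // Hypothetical marker file used to detect whether the old directory survived.
    File staleFile = new File(replicaDir, "staleFile");
    if (!replicaDir.exists() && !replicaDir.mkdirs()) {
        throw new IOException("Could not create " + replicaDir);
    }
    if (!staleFile.exists() && !staleFile.createNewFile()) {
        throw new IOException("Could not create marker " + staleFile);
    }
    // addBlobStore deletes and recreates the dir, so the marker must be gone afterwards.
    return storageManager.addBlobStore(replica) && !staleFile.exists();
}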

Example 82 with ReplicaId

Use of com.github.ambry.clustermap.ReplicaId in project ambry by LinkedIn.

From class AmbryRequests, method isRemoteLagLesserOrEqual.

/**
 * Provides catch-up status of all the remote replicas of {@code partitionIds}.
 * @param partitionIds the {@link PartitionId}s whose remote replica lag has to be <= {@code acceptableLagInBytes}.
 * @param acceptableLagInBytes the maximum lag in bytes that is considered "acceptable".
 * @param numReplicasCaughtUpPerPartition the number of replicas per partition that have to be within
 *                                        {@code acceptableLagInBytes}. The min of this value and the total replica
 *                                        count - 1 is used as the effective threshold.
 * @return {@code true} if, for every {@link PartitionId} in {@code partitionIds}, at least the effective threshold
 * of remote replicas have lag <= {@code acceptableLagInBytes}; {@code false} otherwise.
 */
private boolean isRemoteLagLesserOrEqual(Collection<PartitionId> partitionIds, long acceptableLagInBytes, short numReplicasCaughtUpPerPartition) {
    boolean isAcceptable = true;
    for (PartitionId partitionId : partitionIds) {
        List<? extends ReplicaId> replicaIds = partitionId.getReplicaIds();
        int caughtUpCount = 0;
        for (ReplicaId replicaId : replicaIds) {
            if (!replicaId.getDataNodeId().equals(currentNode)) {
                long lagInBytes = replicationManager.getRemoteReplicaLagFromLocalInBytes(partitionId, replicaId.getDataNodeId().getHostname(), replicaId.getReplicaPath());
                logger.debug("Lag of {} is {}", replicaId, lagInBytes);
                if (lagInBytes <= acceptableLagInBytes) {
                    caughtUpCount++;
                }
                if (caughtUpCount >= numReplicasCaughtUpPerPartition) {
                    break;
                }
            }
        }
        // -1 because we shouldn't consider the replica hosted on this node.
        if (caughtUpCount < Math.min(replicaIds.size() - 1, numReplicasCaughtUpPerPartition)) {
            isAcceptable = false;
            break;
        }
    }
    return isAcceptable;
}
Also used: PartitionId (com.github.ambry.clustermap.PartitionId), ReplicaId (com.github.ambry.clustermap.ReplicaId)
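
The effective per-partition threshold is easy to misread: it is the smaller of numReplicasCaughtUpPerPartition and the remote replica count (total minus the local copy). Here is a self-contained sketch of just that arithmetic, with plain lag values standing in for the replicationManager lookups:

import java.util.List;

/** Pure-Java distillation of the catch-up check above; no Ambry types involved. */
static boolean isCaughtUp(List<Long> remoteLagsInBytes, long acceptableLagInBytes, int numReplicasCaughtUpPerPartition, int totalReplicaCount) {
    int caughtUpCount = 0;
    for (long lagInBytes : remoteLagsInBytes) {
        if (lagInBytes <= acceptableLagInBytes) {
            caughtUpCount++;
        }
    }
    // The replica hosted on this node never counts, hence totalReplicaCount - 1.
    return caughtUpCount >= Math.min(totalReplicaCount - 1, numReplicasCaughtUpPerPartition);
}

For example, isCaughtUp(List.of(0L, 512L, 4096L), 1024L, 2, 4) returns true: two of the three remote replicas are within 1024 bytes, which meets the effective threshold min(4 - 1, 2) = 2.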

Example 83 with ReplicaId

Use of com.github.ambry.clustermap.ReplicaId in project ambry by LinkedIn.

From class AmbryRequestsTest, method scheduleCompactionFailureTest.

/**
 * Tests failure scenarios for compaction: no partition specified, store not started, disk down, store not
 * scheduled for compaction, and an exception while scheduling.
 * @throws InterruptedException
 * @throws IOException
 */
@Test
public void scheduleCompactionFailureTest() throws InterruptedException, IOException {
    // partitionId not specified
    doScheduleCompactionTest(null, ServerErrorCode.Bad_Request);
    PartitionId id = clusterMap.getWritablePartitionIds().get(0);
    // store is not started - Disk_Unavailable
    storageManager.returnNullStore = true;
    doScheduleCompactionTest(id, ServerErrorCode.Disk_Unavailable);
    storageManager.returnNullStore = false;
    // PartitionUnknown is hard to simulate without betraying knowledge of the internals of MockClusterMap.
    // disk unavailable
    ReplicaId replicaId = null;
    for (ReplicaId replica : id.getReplicaIds()) {
        if (replica.getDataNodeId().equals(dataNodeId)) {
            replicaId = replica;
            break;
        }
    }
    assertNotNull("Should have found a replicaId", replicaId);
    clusterMap.onReplicaEvent(replicaId, ReplicaEventType.Disk_Error);
    doScheduleCompactionTest(id, ServerErrorCode.Disk_Unavailable);
    clusterMap.onReplicaEvent(replicaId, ReplicaEventType.Disk_Ok);
    // store cannot be scheduled for compaction - Unknown_Error
    storageManager.returnValueOfSchedulingCompaction = false;
    doScheduleCompactionTest(id, ServerErrorCode.Unknown_Error);
    storageManager.returnValueOfSchedulingCompaction = true;
    // exception while attempting to schedule - Unknown_Error
    storageManager.exceptionToThrowOnSchedulingCompaction = new IllegalStateException();
    doScheduleCompactionTest(id, ServerErrorCode.Unknown_Error);
    storageManager.exceptionToThrowOnSchedulingCompaction = null;
}
Also used: PartitionId (com.github.ambry.clustermap.PartitionId), ReplicaId (com.github.ambry.clustermap.ReplicaId), UtilsTest (com.github.ambry.utils.UtilsTest), Test (org.junit.Test)
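
The loop above that locates the replica of id hosted on dataNodeId is a recurring pattern in these tests. A stream-based equivalent, offered as a sketch rather than an existing Ambry helper:

import com.github.ambry.clustermap.DataNodeId;
import com.github.ambry.clustermap.PartitionId;
import com.github.ambry.clustermap.ReplicaId;

/** Returns the replica of {@code partition} hosted on {@code node}, or null if there is none. */
static ReplicaId findLocalReplica(PartitionId partition, DataNodeId node) {
    return partition.getReplicaIds().stream()
        .filter(replica -> replica.getDataNodeId().equals(node))
        .findFirst()
        .orElse(null);
}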

Example 84 with ReplicaId

Use of com.github.ambry.clustermap.ReplicaId in project ambry by LinkedIn.

From class NonBlockingRouterTest, method testFailureDetectorNotification.

/**
 * Test that failure detector is correctly notified for all responses regardless of the order in which successful
 * and failed responses arrive.
 * @param opHelper the {@link OperationHelper}
 * @param networkClient the {@link NetworkClient}
 * @param failedReplicaIds the list that will contain all the replicas for which failure was notified.
 * @param blobId the id of the blob to get/delete. For puts, this will be null.
 * @param successfulResponseCount the AtomicInteger that will contain the count of replicas for which success was
 *                                notified.
 * @param invalidResponse the AtomicBoolean that will contain whether an unexpected failure was notified.
 * @param indexToFail if greater than or equal to 0, the index of the response whose failure is to be simulated.
 *                    For example, if index is 0, then the first response will be failed.
 *                    If the index is -1, no responses will be failed, and successful responses will be returned to
 *                    the operation managers.
 */
private void testFailureDetectorNotification(OperationHelper opHelper, NetworkClient networkClient, List<ReplicaId> failedReplicaIds, BlobId blobId, AtomicInteger successfulResponseCount, AtomicBoolean invalidResponse, int indexToFail) throws Exception {
    failedReplicaIds.clear();
    successfulResponseCount.set(0);
    invalidResponse.set(false);
    mockSelectorState.set(MockSelectorState.Good);
    FutureResult futureResult = opHelper.submitOperation(blobId);
    int requestParallelism = opHelper.requestParallelism;
    List<RequestInfo> allRequests = new ArrayList<>();
    long loopStartTimeMs = SystemTime.getInstance().milliseconds();
    while (allRequests.size() < requestParallelism) {
        if (loopStartTimeMs + AWAIT_TIMEOUT_MS < SystemTime.getInstance().milliseconds()) {
            Assert.fail("Waited too long for requests.");
        }
        opHelper.pollOpManager(allRequests);
    }
    ReplicaId replicaIdToFail = indexToFail == -1 ? null : ((RouterRequestInfo) allRequests.get(indexToFail)).getReplicaId();
    for (RequestInfo requestInfo : allRequests) {
        ResponseInfo responseInfo;
        if (replicaIdToFail != null && replicaIdToFail.equals(((RouterRequestInfo) requestInfo).getReplicaId())) {
            responseInfo = new ResponseInfo(requestInfo, NetworkClientErrorCode.NetworkError, null);
        } else {
            List<RequestInfo> requestInfoListToSend = new ArrayList<>();
            requestInfoListToSend.add(requestInfo);
            List<ResponseInfo> responseInfoList;
            loopStartTimeMs = SystemTime.getInstance().milliseconds();
            do {
                if (loopStartTimeMs + AWAIT_TIMEOUT_MS < SystemTime.getInstance().milliseconds()) {
                    Assert.fail("Waited too long for the response.");
                }
                responseInfoList = networkClient.sendAndPoll(requestInfoListToSend, 10);
                requestInfoListToSend.clear();
            } while (responseInfoList.size() == 0);
            responseInfo = responseInfoList.get(0);
        }
        opHelper.handleResponse(responseInfo);
    }
    // Poll once again so that the operation gets a chance to complete.
    allRequests.clear();
    if (testEncryption) {
        opHelper.awaitOpCompletionOrTimeOut(futureResult);
    } else {
        opHelper.pollOpManager(allRequests);
    }
    futureResult.get(AWAIT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
    if (indexToFail == -1) {
        Assert.assertEquals("Successful notification should have arrived for replicas that were up", opHelper.requestParallelism, successfulResponseCount.get());
        Assert.assertEquals("Failure detector should not have been notified", 0, failedReplicaIds.size());
        Assert.assertFalse("There should be no notifications of any other kind", invalidResponse.get());
    } else {
        Assert.assertEquals("Failure detector should have been notified", 1, failedReplicaIds.size());
        Assert.assertEquals("Failed notification should have arrived for the failed replica", replicaIdToFail, failedReplicaIds.get(0));
        Assert.assertEquals("Successful notification should have arrived for replicas that were up", opHelper.requestParallelism - 1, successfulResponseCount.get());
        Assert.assertFalse("There should be no notifications of any other kind", invalidResponse.get());
    }
}
Also used: ResponseInfo (com.github.ambry.network.ResponseInfo), ArrayList (java.util.ArrayList), RequestInfo (com.github.ambry.network.RequestInfo), ReplicaId (com.github.ambry.clustermap.ReplicaId)
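
Both wait loops in this test share the same bounded-polling shape: keep polling until a condition holds or a deadline passes, then fail. A generic sketch of that pattern (the names are illustrative, not router API):

import java.util.function.BooleanSupplier;

/** Runs {@code pollAction} until {@code done} holds, failing once {@code timeoutMs} elapses. */
static void pollUntil(Runnable pollAction, BooleanSupplier done, long timeoutMs) {
    long deadlineMs = System.currentTimeMillis() + timeoutMs;
    while (!done.getAsBoolean()) {
        if (System.currentTimeMillis() > deadlineMs) {
            throw new AssertionError("Waited too long for condition");
        }
        pollAction.run();
    }
}

Assuming pollOpManager declares no checked exceptions, the first loop above could then read: pollUntil(() -> opHelper.pollOpManager(allRequests), () -> allRequests.size() >= requestParallelism, AWAIT_TIMEOUT_MS).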

Example 85 with ReplicaId

Use of com.github.ambry.clustermap.ReplicaId in project ambry by LinkedIn.

From class VcrReplicationManager, method addReplica.

/**
 * Add a replica of given {@link PartitionId} and its {@link RemoteReplicaInfo}s to backup list.
 * @param partitionId the {@link PartitionId} of the replica to add.
 * @throws ReplicationException if replicas initialization failed.
 */
void addReplica(PartitionId partitionId) throws ReplicationException {
    if (partitionToPartitionInfo.containsKey(partitionId)) {
        throw new ReplicationException("Partition " + partitionId + " already exists on " + dataNodeId);
    }
    ReplicaId cloudReplica = new CloudReplica(partitionId, vcrClusterParticipant.getCurrentDataNodeId());
    if (!storeManager.addBlobStore(cloudReplica)) {
        logger.error("Can't start cloudstore for replica {}", cloudReplica);
        throw new ReplicationException("Can't start cloudstore for replica " + cloudReplica);
    }
    List<? extends ReplicaId> peerReplicas = cloudReplica.getPeerReplicaIds();
    List<RemoteReplicaInfo> remoteReplicaInfos = new ArrayList<>();
    Store store = storeManager.getStore(partitionId);
    if (peerReplicas != null) {
        for (ReplicaId peerReplica : peerReplicas) {
            if (!shouldReplicateFromDc(peerReplica.getDataNodeId().getDatacenterName())) {
                continue;
            }
            // We need to ensure that a replica token gets persisted only after the corresponding data in the
            // store gets flushed to cloud. We use the store flush interval multiplied by a constant factor
            // to determine the token flush interval
            FindTokenFactory findTokenFactory = tokenHelper.getFindTokenFactoryFromReplicaType(peerReplica.getReplicaType());
            RemoteReplicaInfo remoteReplicaInfo = new RemoteReplicaInfo(peerReplica, cloudReplica, store, findTokenFactory.getNewFindToken(), storeConfig.storeDataFlushIntervalSeconds * SystemTime.MsPerSec * Replication_Delay_Multiplier, SystemTime.getInstance(), peerReplica.getDataNodeId().getPortToConnectTo());
            replicationMetrics.addMetricsForRemoteReplicaInfo(remoteReplicaInfo, trackPerDatacenterLagInMetric);
            remoteReplicaInfos.add(remoteReplicaInfo);
        }
        rwLock.writeLock().lock();
        try {
            updatePartitionInfoMaps(remoteReplicaInfos, cloudReplica);
            partitionStoreMap.put(partitionId.toPathString(), store);
            // Reload replication tokens if they exist.
            int tokenReloadFailCount = reloadReplicationTokenIfExists(cloudReplica, remoteReplicaInfos);
            vcrMetrics.tokenReloadWarnCount.inc(tokenReloadFailCount);
            // Add remoteReplicaInfos to {@link ReplicaThread}.
            addRemoteReplicaInfoToReplicaThread(remoteReplicaInfos, true);
            if (replicationConfig.replicationTrackPerPartitionLagFromRemote) {
                replicationMetrics.addLagMetricForPartition(partitionId, true);
            }
        } finally {
            rwLock.writeLock().unlock();
        }
    } else {
        try {
            storeManager.shutdownBlobStore(partitionId);
            storeManager.removeBlobStore(partitionId);
        } finally {
            throw new ReplicationException("Failed to add Partition " + partitionId + " on " + dataNodeId + " , because no peer replicas found.");
        }
    }
}
Also used: CloudReplica (com.github.ambry.clustermap.CloudReplica), RemoteReplicaInfo (com.github.ambry.replication.RemoteReplicaInfo), ArrayList (java.util.ArrayList), Store (com.github.ambry.store.Store), ReplicationException (com.github.ambry.replication.ReplicationException), FindTokenFactory (com.github.ambry.replication.FindTokenFactory), ReplicaId (com.github.ambry.clustermap.ReplicaId)
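
The token flush interval passed to the RemoteReplicaInfo constructor is a plain product: the store's data flush interval, converted to milliseconds and stretched by the delay multiplier so tokens always trail flushed data. Spelled out with illustrative values (the real ones come from StoreConfig and the Replication_Delay_Multiplier constant, whose value is not shown here):

// Illustrative arithmetic only; the actual values come from configuration.
long storeDataFlushIntervalSeconds = 60; // storeConfig.storeDataFlushIntervalSeconds (example value)
long msPerSec = 1000L;                   // SystemTime.MsPerSec
long replicationDelayMultiplier = 5;     // Replication_Delay_Multiplier (assumed value)
long tokenFlushIntervalMs = storeDataFlushIntervalSeconds * msPerSec * replicationDelayMultiplier;
// With these example values a replica token is persisted at most once per 300,000 ms (5 minutes),
// comfortably after the corresponding store data has been flushed to the cloud.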

Aggregations

ReplicaId (com.github.ambry.clustermap.ReplicaId): 147
Test (org.junit.Test): 83
PartitionId (com.github.ambry.clustermap.PartitionId): 68
MockPartitionId (com.github.ambry.clustermap.MockPartitionId): 60
MockReplicaId (com.github.ambry.clustermap.MockReplicaId): 57
ArrayList (java.util.ArrayList): 55
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId): 43
DataNodeId (com.github.ambry.clustermap.DataNodeId): 32
MockClusterMap (com.github.ambry.clustermap.MockClusterMap): 31
MetricRegistry (com.codahale.metrics.MetricRegistry): 29
HashMap (java.util.HashMap): 28
HashSet (java.util.HashSet): 25
ClusterMapConfig (com.github.ambry.config.ClusterMapConfig): 24
VerifiableProperties (com.github.ambry.config.VerifiableProperties): 24
BlobStoreTest (com.github.ambry.store.BlobStoreTest): 24
File (java.io.File): 24
List (java.util.List): 21
Map (java.util.Map): 21
Port (com.github.ambry.network.Port): 20
Properties (java.util.Properties): 20