use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class StorageManagerTest method addBlobStoreTest.
/**
* Test adding a new BlobStore with a given {@link ReplicaId}.
*/
@Test
public void addBlobStoreTest() throws Exception {
generateConfigs(true, false);
MockDataNodeId localNode = clusterMap.getDataNodes().get(0);
List<ReplicaId> localReplicas = clusterMap.getReplicaIds(localNode);
int newMountPathIndex = 3;
// add new MountPath to local node
File f = File.createTempFile("ambry", ".tmp");
File mountFile = new File(f.getParent(), "mountpathfile" + MockClusterMap.PLAIN_TEXT_PORT_START_NUMBER + newMountPathIndex);
MockClusterMap.deleteFileOrDirectory(mountFile);
assertTrue("Couldn't create mount path directory", mountFile.mkdir());
localNode.addMountPaths(Collections.singletonList(mountFile.getAbsolutePath()));
PartitionId newPartition1 = new MockPartitionId(10L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), newMountPathIndex);
StorageManager storageManager = createStorageManager(localNode, metricRegistry, null);
storageManager.start();
// test add store that already exists, which should fail
assertFalse("Adding a store that already exists should fail", storageManager.addBlobStore(localReplicas.get(0)));
// test add store onto a new disk, which should succeed
assertTrue("Add new store should succeed", storageManager.addBlobStore(newPartition1.getReplicaIds().get(0)));
assertNotNull("The store shouldn't be null because new store is successfully added", storageManager.getStore(newPartition1, false));
// test add store whose diskManager is not running, which should fail
PartitionId newPartition2 = new MockPartitionId(11L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), 0);
storageManager.getDiskManager(localReplicas.get(0).getPartitionId()).shutdown();
assertFalse("Add store onto the DiskManager which is not running should fail", storageManager.addBlobStore(newPartition2.getReplicaIds().get(0)));
storageManager.getDiskManager(localReplicas.get(0).getPartitionId()).start();
// test replica addition can correctly handle existing dir (should delete it and create a new one)
// To verify the directory has been recreated, we purposely put a test file in previous dir.
PartitionId newPartition3 = new MockPartitionId(12L, MockClusterMap.DEFAULT_PARTITION_CLASS, clusterMap.getDataNodes(), 0);
ReplicaId replicaToAdd = newPartition3.getReplicaIds().get(0);
File previousDir = new File(replicaToAdd.getReplicaPath());
File testFile = new File(previousDir, "testFile");
MockClusterMap.deleteFileOrDirectory(previousDir);
assertTrue("Cannot create dir for " + replicaToAdd.getReplicaPath(), previousDir.mkdir());
assertTrue("Cannot create test file within previous dir", testFile.createNewFile());
assertTrue("Adding new store should succeed", storageManager.addBlobStore(replicaToAdd));
assertFalse("Test file should not exist", testFile.exists());
assertNotNull("Store associated with the newly added replica should not be null", storageManager.getStore(newPartition3, false));
shutdownAndAssertStoresInaccessible(storageManager, localReplicas);
// test add store but fail to add segment requirements to DiskSpaceAllocator. (This is simulated by inducing
// addRequiredSegments failure to make store inaccessible)
List<String> mountPaths = localNode.getMountPaths();
String diskToFail = mountPaths.get(0);
File reservePoolDir = new File(diskToFail, diskManagerConfig.diskManagerReserveFileDirName);
File storeReserveDir = new File(reservePoolDir, DiskSpaceAllocator.STORE_DIR_PREFIX + newPartition2.toPathString());
StorageManager storageManager2 = createStorageManager(localNode, new MetricRegistry(), null);
storageManager2.start();
Utils.deleteFileOrDirectory(storeReserveDir);
assertTrue("File creation should succeed", storeReserveDir.createNewFile());
assertFalse("Add store should fail if store couldn't start due to initializePool failure", storageManager2.addBlobStore(newPartition2.getReplicaIds().get(0)));
assertNull("New store shouldn't be in in-memory data structure", storageManager2.getStore(newPartition2, false));
shutdownAndAssertStoresInaccessible(storageManager2, localReplicas);
}
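For orientation, the core flow this test exercises can be reduced to a short sketch. This is illustrative only: it assumes a started StorageManager (storageManager) and a ReplicaId (replica) whose mount path is already managed by one of its disk managers; it is not part of the test above.

// Sketch only: add a store for a new replica and look it up afterwards.
// addBlobStore returns false if the store already exists or if the DiskManager
// that owns the target mount path is not running (both cases are asserted above).
if (storageManager.addBlobStore(replica)) {
  Store store = storageManager.getStore(replica.getPartitionId(), false);
  // the replica directory is (re)created under the mount path and the store is started
} else {
  // handle the failure (store already present, disk manager not running, etc.)
}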
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class AmbryRequests method isRemoteLagLesserOrEqual.
/**
* Provides the catch-up status of all the remote replicas of {@code partitionIds}.
* @param partitionIds the {@link PartitionId}s for which the lag has to be <= {@code acceptableLagInBytes}.
* @param acceptableLagInBytes the maximum lag in bytes that is considered "acceptable".
* @param numReplicasCaughtUpPerPartition the number of replicas (per partition) that have to be within
* {@code acceptableLagInBytes}. The minimum of this value and the total replica count minus one is used.
* @return {@code true} if, for each {@link PartitionId} in {@code partitionIds}, the required number of remote
* replicas have a lag <= {@code acceptableLagInBytes}; {@code false} otherwise.
*/
private boolean isRemoteLagLesserOrEqual(Collection<PartitionId> partitionIds, long acceptableLagInBytes, short numReplicasCaughtUpPerPartition) {
  boolean isAcceptable = true;
  for (PartitionId partitionId : partitionIds) {
    List<? extends ReplicaId> replicaIds = partitionId.getReplicaIds();
    int caughtUpCount = 0;
    for (ReplicaId replicaId : replicaIds) {
      if (!replicaId.getDataNodeId().equals(currentNode)) {
        long lagInBytes = replicationManager.getRemoteReplicaLagFromLocalInBytes(partitionId, replicaId.getDataNodeId().getHostname(), replicaId.getReplicaPath());
        logger.debug("Lag of {} is {}", replicaId, lagInBytes);
        if (lagInBytes <= acceptableLagInBytes) {
          caughtUpCount++;
        }
        if (caughtUpCount >= numReplicasCaughtUpPerPartition) {
          break;
        }
      }
    }
    // -1 because we shouldn't consider the replica hosted on this node.
    if (caughtUpCount < Math.min(replicaIds.size() - 1, numReplicasCaughtUpPerPartition)) {
      isAcceptable = false;
      break;
    }
  }
  return isAcceptable;
}
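A hedged usage sketch follows. It does not reproduce the actual call site in AmbryRequests; it only illustrates the kind of yes/no catch-up question the method answers, assuming partitionIds is a Collection<PartitionId> already in hand.

// Illustrative only: require at least two remote replicas of every partition to be fully
// caught up (zero acceptable lag) before proceeding with an administrative action.
boolean caughtUp = isRemoteLagLesserOrEqual(partitionIds, 0L, (short) 2);
if (!caughtUp) {
  logger.info("Some remote replicas are still lagging; deferring the operation");
}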
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class AmbryRequestsTest method scheduleCompactionFailureTest.
/**
* Tests failure scenarios for compaction - disk down, store not scheduled for compaction, exception while scheduling.
* @throws InterruptedException
* @throws IOException
*/
@Test
public void scheduleCompactionFailureTest() throws InterruptedException, IOException {
// partitionId not specified
doScheduleCompactionTest(null, ServerErrorCode.Bad_Request);
PartitionId id = clusterMap.getWritablePartitionIds().get(0);
// store is not started - Disk_Unavailable
storageManager.returnNullStore = true;
doScheduleCompactionTest(id, ServerErrorCode.Disk_Unavailable);
storageManager.returnNullStore = false;
// PartitionUnknown is hard to simulate without betraying knowledge of the internals of MockClusterMap.
// disk unavailable
ReplicaId replicaId = null;
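// locate the replica of this partition that resides on the local node so its disk can be marked as having an error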
for (ReplicaId replica : id.getReplicaIds()) {
  if (replica.getDataNodeId().equals(dataNodeId)) {
    replicaId = replica;
    break;
  }
}
assertNotNull("Should have found a replicaId", replicaId);
clusterMap.onReplicaEvent(replicaId, ReplicaEventType.Disk_Error);
doScheduleCompactionTest(id, ServerErrorCode.Disk_Unavailable);
clusterMap.onReplicaEvent(replicaId, ReplicaEventType.Disk_Ok);
// store cannot be scheduled for compaction - Unknown_Error
storageManager.returnValueOfSchedulingCompaction = false;
doScheduleCompactionTest(id, ServerErrorCode.Unknown_Error);
storageManager.returnValueOfSchedulingCompaction = true;
// exception while attempting to schedule - Unknown_Error
storageManager.exceptionToThrowOnSchedulingCompaction = new IllegalStateException();
doScheduleCompactionTest(id, ServerErrorCode.Unknown_Error);
storageManager.exceptionToThrowOnSchedulingCompaction = null;
}
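For contrast, the success path can be sketched as follows. This is illustrative only and assumes the same test helpers used above; it is not part of the failure test itself.

// Sketch: when the store exists, its disk is healthy and scheduling succeeds,
// the compaction request is expected to come back with ServerErrorCode.No_Error.
PartitionId healthyPartition = clusterMap.getWritablePartitionIds().get(0);
storageManager.returnValueOfSchedulingCompaction = true;
doScheduleCompactionTest(healthyPartition, ServerErrorCode.No_Error);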
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class NonBlockingRouterTest method testFailureDetectorNotification.
/**
* Test that failure detector is correctly notified for all responses regardless of the order in which successful
* and failed responses arrive.
* @param opHelper the {@link OperationHelper}
* @param networkClient the {@link NetworkClient}
* @param failedReplicaIds the list that will contain all the replicas for which failure was notified.
* @param blobId the id of the blob to get/delete. For puts, this will be null.
* @param successfulResponseCount the AtomicInteger that will contain the count of replicas for which success was
* notified.
* @param invalidResponse the AtomicBoolean that will contain whether an unexpected failure was notified.
* @param indexToFail if non-negative, the index of the response for which a failure is to be simulated.
* For example, if the index is 0, then the first response will be failed.
* If the index is -1, no responses will be failed, and successful responses will be returned to
* the operation managers.
*/
private void testFailureDetectorNotification(OperationHelper opHelper, NetworkClient networkClient, List<ReplicaId> failedReplicaIds, BlobId blobId, AtomicInteger successfulResponseCount, AtomicBoolean invalidResponse, int indexToFail) throws Exception {
failedReplicaIds.clear();
successfulResponseCount.set(0);
invalidResponse.set(false);
mockSelectorState.set(MockSelectorState.Good);
FutureResult futureResult = opHelper.submitOperation(blobId);
int requestParallelism = opHelper.requestParallelism;
List<RequestInfo> allRequests = new ArrayList<>();
long loopStartTimeMs = SystemTime.getInstance().milliseconds();
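// Poll the operation manager until the operation has issued its full batch of parallel requests.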
while (allRequests.size() < requestParallelism) {
  if (loopStartTimeMs + AWAIT_TIMEOUT_MS < SystemTime.getInstance().milliseconds()) {
    Assert.fail("Waited too long for requests.");
  }
  opHelper.pollOpManager(allRequests);
}
ReplicaId replicaIdToFail = indexToFail == -1 ? null : ((RouterRequestInfo) allRequests.get(indexToFail)).getReplicaId();
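// Send each request and hand its response back to the operation, simulating a network error for the replica chosen to fail.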
for (RequestInfo requestInfo : allRequests) {
  ResponseInfo responseInfo;
  if (replicaIdToFail != null && replicaIdToFail.equals(((RouterRequestInfo) requestInfo).getReplicaId())) {
    responseInfo = new ResponseInfo(requestInfo, NetworkClientErrorCode.NetworkError, null);
  } else {
    List<RequestInfo> requestInfoListToSend = new ArrayList<>();
    requestInfoListToSend.add(requestInfo);
    List<ResponseInfo> responseInfoList;
    loopStartTimeMs = SystemTime.getInstance().milliseconds();
    do {
      if (loopStartTimeMs + AWAIT_TIMEOUT_MS < SystemTime.getInstance().milliseconds()) {
        Assert.fail("Waited too long for the response.");
      }
      responseInfoList = networkClient.sendAndPoll(requestInfoListToSend, 10);
      requestInfoListToSend.clear();
    } while (responseInfoList.size() == 0);
    responseInfo = responseInfoList.get(0);
  }
  opHelper.handleResponse(responseInfo);
}
// Poll once again so that the operation gets a chance to complete.
allRequests.clear();
if (testEncryption) {
  opHelper.awaitOpCompletionOrTimeOut(futureResult);
} else {
  opHelper.pollOpManager(allRequests);
}
futureResult.get(AWAIT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
if (indexToFail == -1) {
  Assert.assertEquals("Successful notification should have arrived for replicas that were up", opHelper.requestParallelism, successfulResponseCount.get());
  Assert.assertEquals("Failure detector should not have been notified", 0, failedReplicaIds.size());
  Assert.assertFalse("There should be no notifications of any other kind", invalidResponse.get());
} else {
  Assert.assertEquals("Failure detector should have been notified", 1, failedReplicaIds.size());
  Assert.assertEquals("Failed notification should have arrived for the failed replica", replicaIdToFail, failedReplicaIds.get(0));
  Assert.assertEquals("Successful notification should have arrived for replicas that were up", opHelper.requestParallelism - 1, successfulResponseCount.get());
  Assert.assertFalse("There should be no notifications of any other kind", invalidResponse.get());
}
}
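How the helper would typically be driven (the variable bindings below are assumptions for illustration, not the actual calling test code):

// Illustrative only: exercise the helper for one operation type, first failing the
// first response and then failing none, to cover both assertion branches above.
testFailureDetectorNotification(opHelper, networkClient, failedReplicaIds, blobId,
    successfulResponseCount, invalidResponse, 0);   // first response fails
testFailureDetectorNotification(opHelper, networkClient, failedReplicaIds, blobId,
    successfulResponseCount, invalidResponse, -1);  // no responses fail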
use of com.github.ambry.clustermap.ReplicaId in project ambry by linkedin.
the class VcrReplicationManager method addReplica.
/**
* Add a replica of the given {@link PartitionId} and its {@link RemoteReplicaInfo}s to the backup list.
* @param partitionId the {@link PartitionId} of the replica to add.
* @throws ReplicationException if replica initialization fails.
*/
void addReplica(PartitionId partitionId) throws ReplicationException {
  if (partitionToPartitionInfo.containsKey(partitionId)) {
    throw new ReplicationException("Partition " + partitionId + " already exists on " + dataNodeId);
  }
  ReplicaId cloudReplica = new CloudReplica(partitionId, vcrClusterParticipant.getCurrentDataNodeId());
  if (!storeManager.addBlobStore(cloudReplica)) {
    logger.error("Can't start cloudstore for replica {}", cloudReplica);
    throw new ReplicationException("Can't start cloudstore for replica " + cloudReplica);
  }
  List<? extends ReplicaId> peerReplicas = cloudReplica.getPeerReplicaIds();
  List<RemoteReplicaInfo> remoteReplicaInfos = new ArrayList<>();
  Store store = storeManager.getStore(partitionId);
  if (peerReplicas != null) {
    for (ReplicaId peerReplica : peerReplicas) {
      if (!shouldReplicateFromDc(peerReplica.getDataNodeId().getDatacenterName())) {
        continue;
      }
      // We need to ensure that a replica token gets persisted only after the corresponding data in the
      // store gets flushed to the cloud. We use the store flush interval multiplied by a constant factor
      // to determine the token flush interval.
      FindTokenFactory findTokenFactory = tokenHelper.getFindTokenFactoryFromReplicaType(peerReplica.getReplicaType());
      RemoteReplicaInfo remoteReplicaInfo = new RemoteReplicaInfo(peerReplica, cloudReplica, store, findTokenFactory.getNewFindToken(), storeConfig.storeDataFlushIntervalSeconds * SystemTime.MsPerSec * Replication_Delay_Multiplier, SystemTime.getInstance(), peerReplica.getDataNodeId().getPortToConnectTo());
      replicationMetrics.addMetricsForRemoteReplicaInfo(remoteReplicaInfo, trackPerDatacenterLagInMetric);
      remoteReplicaInfos.add(remoteReplicaInfo);
    }
    rwLock.writeLock().lock();
    try {
      updatePartitionInfoMaps(remoteReplicaInfos, cloudReplica);
      partitionStoreMap.put(partitionId.toPathString(), store);
      // Reload the replication token if one exists.
      int tokenReloadFailCount = reloadReplicationTokenIfExists(cloudReplica, remoteReplicaInfos);
      vcrMetrics.tokenReloadWarnCount.inc(tokenReloadFailCount);
      // Add remoteReplicaInfos to {@link ReplicaThread}.
      addRemoteReplicaInfoToReplicaThread(remoteReplicaInfos, true);
      if (replicationConfig.replicationTrackPerPartitionLagFromRemote) {
        replicationMetrics.addLagMetricForPartition(partitionId, true);
      }
    } finally {
      rwLock.writeLock().unlock();
    }
  } else {
    try {
      storeManager.shutdownBlobStore(partitionId);
      storeManager.removeBlobStore(partitionId);
    } finally {
      throw new ReplicationException("Failed to add partition " + partitionId + " on " + dataNodeId + ", because no peer replicas were found.");
    }
  }
}
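A hedged sketch of a caller follows. The callback name onPartitionAssigned is an assumption for illustration; the actual wiring from the VCR cluster participant to this method may differ.

// Illustrative only: react to a partition being assigned to this VCR node by adding it to the backup list.
void onPartitionAssigned(PartitionId partitionId) {
  try {
    addReplica(partitionId);
  } catch (ReplicationException e) {
    // a failure means either the cloud store could not be started or no peer replicas were found
    logger.error("Failed to add partition {} for backup", partitionId, e);
  }
}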