
Example 66 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

The class ReplicationTest, method replicaFromInactiveToOfflineTest.

/**
 * Test the INACTIVE -> OFFLINE transition on an existing replica (both success and failure cases).
 */
@Test
public void replicaFromInactiveToOfflineTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
    Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
    StorageManager storageManager = managers.getFirst();
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    // 1. test replica not found case
    try {
        mockHelixParticipant.onPartitionBecomeOfflineFromInactive("-1");
        fail("should fail because of invalid partition");
    } catch (StateTransitionException e) {
        assertEquals("Error code doesn't match", ReplicaNotFound, e.getErrorCode());
    }
    // 2. test store not started case
    PartitionId existingPartition = replicationManager.partitionToPartitionInfo.keySet().iterator().next();
    storageManager.shutdownBlobStore(existingPartition);
    try {
        mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
        fail("should fail because store is not started");
    } catch (StateTransitionException e) {
        assertEquals("Error code doesn't match", StoreNotStarted, e.getErrorCode());
    }
    storageManager.startBlobStore(existingPartition);
    // before testing the success case, write a blob (size = 100) into the local store and add a delete record for the new blob
    Store localStore = storageManager.getStore(existingPartition);
    MockId id = new MockId(TestUtils.getRandomString(10), Utils.getRandomShort(TestUtils.RANDOM), Utils.getRandomShort(TestUtils.RANDOM));
    long crc = (new Random()).nextLong();
    long blobSize = 100;
    MessageInfo info = new MessageInfo(id, blobSize, false, false, Utils.Infinite_Time, crc, id.getAccountId(), id.getContainerId(), Utils.Infinite_Time);
    List<MessageInfo> infos = new ArrayList<>();
    List<ByteBuffer> buffers = new ArrayList<>();
    ByteBuffer buffer = ByteBuffer.wrap(TestUtils.getRandomBytes((int) blobSize));
    infos.add(info);
    buffers.add(buffer);
    localStore.put(new MockMessageWriteSet(infos, buffers));
    // delete the blob
    int deleteRecordSize = (int) (new DeleteMessageFormatInputStream(id, (short) 0, (short) 0, 0).getSize());
    MessageInfo deleteInfo = new MessageInfo(id, deleteRecordSize, id.getAccountId(), id.getContainerId(), time.milliseconds());
    localStore.delete(Collections.singletonList(deleteInfo));
    int sizeOfPutAndHeader = 100 + 18;
    int sizeOfWhole = sizeOfPutAndHeader + deleteRecordSize;
    // note that the end offset of the last PUT = 100 + 18 = 118; the end offset of the store is sizeOfWhole
    // 3. test success case (create a new thread and trigger INACTIVE -> OFFLINE transition)
    ReplicaId localReplica = storageManager.getReplica(existingPartition.toPathString());
    // put a decommission-in-progress file into local store dir
    File decommissionFile = new File(localReplica.getReplicaPath(), "decommission_in_progress");
    assertTrue("Couldn't create decommission file in local store", decommissionFile.createNewFile());
    decommissionFile.deleteOnExit();
    assertNotSame("Before disconnection, the local store state shouldn't be OFFLINE", ReplicaState.OFFLINE, localStore.getCurrentState());
    mockHelixParticipant.registerPartitionStateChangeListener(StateModelListenerType.ReplicationManagerListener, replicationManager.replicationListener);
    CountDownLatch participantLatch = new CountDownLatch(1);
    replicationManager.listenerExecutionLatch = new CountDownLatch(1);
    Utils.newThread(() -> {
        mockHelixParticipant.onPartitionBecomeOfflineFromInactive(existingPartition.toPathString());
        participantLatch.countDown();
    }, false).start();
    assertTrue("Partition state change listener in ReplicationManager didn't get called within 1 sec", replicationManager.listenerExecutionLatch.await(1, TimeUnit.SECONDS));
    // the state of local store should be updated to OFFLINE
    assertEquals("Local store state is not expected", ReplicaState.OFFLINE, localStore.getCurrentState());
    // update replication lag between local and peer replicas
    List<RemoteReplicaInfo> remoteReplicaInfos = replicationManager.partitionToPartitionInfo.get(existingPartition).getRemoteReplicaInfos();
    ReplicaId peerReplica1 = remoteReplicaInfos.get(0).getReplicaId();
    ReplicaId peerReplica2 = remoteReplicaInfos.get(1).getReplicaId();
    // peer1 catches up with the last PUT, peer2 catches up with the end offset of the local store; in this case, sync-up is not complete
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), sizeOfPutAndHeader);
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica2.getDataNodeId().getHostname(), peerReplica2.getReplicaPath(), sizeOfWhole);
    assertFalse("Only one peer replica has fully caught up with end offset so sync-up should not complete", mockHelixParticipant.getReplicaSyncUpManager().isSyncUpComplete(localReplica));
    // make peer1 catch up with end offset
    replicationManager.updateTotalBytesReadByRemoteReplica(existingPartition, peerReplica1.getDataNodeId().getHostname(), peerReplica1.getReplicaPath(), sizeOfWhole);
    // Now, sync-up should complete and transition should be able to proceed.
    assertTrue("Inactive-To-Offline transition didn't complete within 1 sec", participantLatch.await(1, TimeUnit.SECONDS));
    assertFalse("Local store should be stopped after transition", localStore.isStarted());
    storageManager.shutdown();
}
Also used: StorageManager (com.github.ambry.store.StorageManager), ArrayList (java.util.ArrayList), Store (com.github.ambry.store.Store), MockId (com.github.ambry.store.MockId), MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant), Random (java.util.Random), MetricRegistry (com.codahale.metrics.MetricRegistry), DeleteMessageFormatInputStream (com.github.ambry.messageformat.DeleteMessageFormatInputStream), MockPartitionId (com.github.ambry.clustermap.MockPartitionId), PartitionId (com.github.ambry.clustermap.PartitionId), CountDownLatch (java.util.concurrent.CountDownLatch), ByteBuffer (java.nio.ByteBuffer), ClusterMapConfig (com.github.ambry.config.ClusterMapConfig), MockReplicaId (com.github.ambry.clustermap.MockReplicaId), ReplicaId (com.github.ambry.clustermap.ReplicaId), MessageInfo (com.github.ambry.store.MessageInfo), MockMessageWriteSet (com.github.ambry.store.MockMessageWriteSet), File (java.io.File), MockClusterMap (com.github.ambry.clustermap.MockClusterMap), StateTransitionException (com.github.ambry.clustermap.StateTransitionException), Test (org.junit.Test)
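
The transition above is gated by the ReplicaSyncUpManager: the participant thread blocks until enough peer replicas have caught up with the local store's end offset. The catch-up rule that the asserts exercise can be sketched in a few lines (an illustration only, not Ambry's implementation; the map shape and the requiredPeers parameter are hypothetical):

import java.util.Map;

class SyncUpRuleSketch {
    // Sync-up completes once at least requiredPeers peers have read up to the local store's
    // end offset. In the test above, sync-up stays incomplete while only one of the two
    // peers has caught up with sizeOfWhole.
    static boolean isSyncUpComplete(Map<String, Long> bytesReadByPeer, long localEndOffset, int requiredPeers) {
        long caughtUp = bytesReadByPeer.values().stream().filter(read -> read >= localEndOffset).count();
        return caughtUp >= requiredPeers;
    }
}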

Example 67 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

The class ReplicationTest, method replicaFromStandbyToLeaderTest.

/**
 * Test the state transition in the replication manager from STANDBY to LEADER.
 * Test setup: when creating partitions, make sure that there is exactly one replica in the LEADER state in each data center.
 * Test condition: when a partition on the current node moves from STANDBY to LEADER, verify that the in-memory map from
 * partition to peer leader replicas is updated correctly.
 * @throws Exception
 */
@Test
public void replicaFromStandbyToLeaderTest() throws Exception {
    MockClusterMap clusterMap = new MockClusterMap();
    ClusterMapConfig clusterMapConfig = new ClusterMapConfig(verifiableProperties);
    MockHelixParticipant.metricRegistry = new MetricRegistry();
    MockHelixParticipant mockHelixParticipant = new MockHelixParticipant(clusterMapConfig);
    ReplicationConfig initialReplicationConfig = replicationConfig;
    properties.setProperty("replication.model.across.datacenters", "LEADER_BASED");
    replicationConfig = new ReplicationConfig(new VerifiableProperties(properties));
    Pair<StorageManager, ReplicationManager> managers = createStorageManagerAndReplicationManager(clusterMap, clusterMapConfig, mockHelixParticipant);
    StorageManager storageManager = managers.getFirst();
    MockReplicationManager replicationManager = (MockReplicationManager) managers.getSecond();
    List<ReplicaId> replicaIds = clusterMap.getReplicaIds(replicationManager.dataNodeId);
    for (ReplicaId replicaId : replicaIds) {
        MockReplicaId mockReplicaId = (MockReplicaId) replicaId;
        if (mockReplicaId.getReplicaState() == ReplicaState.LEADER) {
            PartitionId existingPartition = mockReplicaId.getPartitionId();
            mockHelixParticipant.onPartitionBecomeLeaderFromStandby(existingPartition.toPathString());
            Set<ReplicaId> peerLeaderReplicasInReplicationManager = replicationManager.leaderBasedReplicationAdmin.getLeaderPartitionToPeerLeaderReplicas().get(existingPartition.toPathString());
            Set<ReplicaId> peerLeaderReplicasInClusterMap = new HashSet<>(existingPartition.getReplicaIdsByState(ReplicaState.LEADER, null));
            peerLeaderReplicasInClusterMap.remove(mockReplicaId);
            assertThat("Mismatch in list of leader peer replicas stored by partition in replication manager and cluster map", peerLeaderReplicasInReplicationManager, is(peerLeaderReplicasInClusterMap));
        }
    }
    storageManager.shutdown();
    replicationConfig = initialReplicationConfig;
}
Also used: ReplicationConfig (com.github.ambry.config.ReplicationConfig), VerifiableProperties (com.github.ambry.config.VerifiableProperties), MetricRegistry (com.codahale.metrics.MetricRegistry), StorageManager (com.github.ambry.store.StorageManager), MockPartitionId (com.github.ambry.clustermap.MockPartitionId), PartitionId (com.github.ambry.clustermap.PartitionId), ClusterMapConfig (com.github.ambry.config.ClusterMapConfig), MockReplicaId (com.github.ambry.clustermap.MockReplicaId), ReplicaId (com.github.ambry.clustermap.ReplicaId), MockHelixParticipant (com.github.ambry.clustermap.MockHelixParticipant), MockClusterMap (com.github.ambry.clustermap.MockClusterMap), HashSet (java.util.HashSet), Test (org.junit.Test)
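
The essential setup here is the switch to leader-based cross-datacenter replication; everything else mirrors the earlier tests. A minimal sketch of building such a config, using only the property the test touches (a real deployment sets many more replication properties):

import java.util.Properties;

import com.github.ambry.config.ReplicationConfig;
import com.github.ambry.config.VerifiableProperties;

class LeaderBasedConfigSketch {
    static ReplicationConfig buildLeaderBasedConfig() {
        Properties props = new Properties();
        // Replicate across data centers through leader replicas only, instead of all-to-all.
        props.setProperty("replication.model.across.datacenters", "LEADER_BASED");
        return new ReplicationConfig(new VerifiableProperties(props));
    }
}

Note that the test saves and restores replicationConfig afterwards because it is shared state across the test class.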

Example 68 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

The class ReplicationTestHelper, method getLocalAndRemoteHosts.

// Static methods to get local and remote hosts and to add PUT/TTL update/DELETE/UNDELETE messages to partitions
/**
 * Selects a local and remote host for replication tests that need it.
 * @param clusterMap the {@link MockClusterMap} to use.
 * @return a {@link Pair} with the first entry being the "local" host and the second, the "remote" host.
 */
public static Pair<MockHost, MockHost> getLocalAndRemoteHosts(MockClusterMap clusterMap) {
    // to make sure we select hosts with the SPECIAL_PARTITION_CLASS, pick hosts from the replicas of that partition
    PartitionId specialPartitionId = clusterMap.getWritablePartitionIds(MockClusterMap.SPECIAL_PARTITION_CLASS).get(0);
    // these hosts have replicas of the "special" partition and all the other partitions.
    MockHost localHost = new MockHost(specialPartitionId.getReplicaIds().get(0).getDataNodeId(), clusterMap);
    MockHost remoteHost = new MockHost(specialPartitionId.getReplicaIds().get(1).getDataNodeId(), clusterMap);
    return new Pair<>(localHost, remoteHost);
}
Also used: MockPartitionId (com.github.ambry.clustermap.MockPartitionId), PartitionId (com.github.ambry.clustermap.PartitionId), Pair (com.github.ambry.utils.Pair)
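
A typical call site, using the Pair accessors seen in the tests above (variable names are hypothetical; assuming the sketch lives in the same package as ReplicationTestHelper, so MockHost needs no import):

import com.github.ambry.clustermap.MockClusterMap;
import com.github.ambry.utils.Pair;

class HostSelectionSketch {
    static void pickHosts() throws Exception {
        MockClusterMap clusterMap = new MockClusterMap();
        Pair<MockHost, MockHost> hosts = ReplicationTestHelper.getLocalAndRemoteHosts(clusterMap);
        MockHost localHost = hosts.getFirst();   // holds replica 0 of the "special" partition
        MockHost remoteHost = hosts.getSecond(); // a different node holding replica 1
    }
}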

Example 69 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

The class ReplicationTestHelper, method getRemoteReplicasAndReplicaThread.

/**
 * Creates and gets the remote replicas that the local host will deal with and the {@link ReplicaThread} to perform
 * replication with.
 * @param batchSize the number of messages to be returned in each iteration of replication
 * @param clusterMap the {@link ClusterMap} to use
 * @param localHost the local {@link MockHost} (the one running the replica thread)
 * @param remoteHost the remote {@link MockHost} (the target of replication)
 * @param storeKeyConverter the {@link StoreKeyConverter} to be used in {@link ReplicaThread}
 * @param transformer the {@link Transformer} to be used in {@link ReplicaThread}
 * @param listener the {@link StoreEventListener} to use.
 * @param replicaSyncUpManager the {@link ReplicaSyncUpManager} to help create replica thread
 * @return a pair whose first element is the set of remote replicas and the second element is the {@link ReplicaThread}
 */
protected Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> getRemoteReplicasAndReplicaThread(int batchSize, ClusterMap clusterMap, MockHost localHost, MockHost remoteHost, StoreKeyConverter storeKeyConverter, Transformer transformer, StoreEventListener listener, ReplicaSyncUpManager replicaSyncUpManager) throws ReflectiveOperationException {
    ReplicationMetrics replicationMetrics = new ReplicationMetrics(new MetricRegistry(), clusterMap.getReplicaIds(localHost.dataNodeId));
    replicationMetrics.populateSingleColoMetrics(remoteHost.dataNodeId.getDatacenterName());
    List<RemoteReplicaInfo> remoteReplicaInfoList = localHost.getRemoteReplicaInfos(remoteHost, listener);
    Map<DataNodeId, List<RemoteReplicaInfo>> replicasToReplicate = Collections.singletonMap(remoteHost.dataNodeId, remoteReplicaInfoList);
    StoreKeyFactory storeKeyFactory = Utils.getObj("com.github.ambry.commons.BlobIdFactory", clusterMap);
    Map<DataNodeId, MockHost> hosts = new HashMap<>();
    hosts.put(remoteHost.dataNodeId, remoteHost);
    MockConnectionPool connectionPool = new MockConnectionPool(hosts, clusterMap, batchSize);
    ReplicaThread replicaThread = new ReplicaThread("threadtest", new MockFindTokenHelper(storeKeyFactory, replicationConfig), clusterMap, new AtomicInteger(0), localHost.dataNodeId, connectionPool, replicationConfig, replicationMetrics, null, storeKeyConverter, transformer, clusterMap.getMetricRegistry(), false, localHost.dataNodeId.getDatacenterName(), new ResponseHandler(clusterMap), time, replicaSyncUpManager, null, null);
    for (RemoteReplicaInfo remoteReplicaInfo : remoteReplicaInfoList) {
        replicaThread.addRemoteReplicaInfo(remoteReplicaInfo);
    }
    for (PartitionId partitionId : clusterMap.getAllPartitionIds(null)) {
        replicationMetrics.addLagMetricForPartition(partitionId, true);
    }
    return new Pair<>(replicasToReplicate, replicaThread);
}
Also used: ResponseHandler (com.github.ambry.commons.ResponseHandler), HashMap (java.util.HashMap), MetricRegistry (com.codahale.metrics.MetricRegistry), MockPartitionId (com.github.ambry.clustermap.MockPartitionId), PartitionId (com.github.ambry.clustermap.PartitionId), StoreKeyFactory (com.github.ambry.store.StoreKeyFactory), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), List (java.util.List), ArrayList (java.util.ArrayList), DataNodeId (com.github.ambry.clustermap.DataNodeId), Pair (com.github.ambry.utils.Pair)
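
In the tests, the returned thread is usually driven directly rather than started via run(). A hedged usage sketch, written as a method inside a ReplicationTestHelper subclass (the batch size of 4 is arbitrary, and replicate() as the single-pass entry point is an assumption based on how ReplicationTest drives ReplicaThread):

void replicateOnce(ClusterMap clusterMap, MockHost localHost, MockHost remoteHost,
    StoreKeyConverter storeKeyConverter, Transformer transformer) throws Exception {
    Pair<Map<DataNodeId, List<RemoteReplicaInfo>>, ReplicaThread> replicasAndThread =
        getRemoteReplicasAndReplicaThread(4, clusterMap, localHost, remoteHost, storeKeyConverter,
            transformer, null /* no store event listener */, null /* no replica sync-up manager */);
    ReplicaThread replicaThread = replicasAndThread.getSecond();
    // One replication pass over every remote replica registered by the helper above.
    replicaThread.replicate();
}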

Example 70 with PartitionId

Use of com.github.ambry.clustermap.PartitionId in project ambry by LinkedIn.

The class CryptoJobMetricsTracker, method initializeResourceToHistogramMap.

/**
 * Initialize the resource-to-latency-histogram maps based on the given resource type. Here, a resource can be a
 * {@link PartitionId}, {@link DataNodeId}, etc. The resource type is defined by
 * {@link RouterConfig#routerOperationTrackerMetricScope}.
 * @param clusterMap the {@link ClusterMap} that contains info of all resources.
 * @param routerConfig the {@link RouterConfig} that specifies histogram parameters.
 */
private void initializeResourceToHistogramMap(ClusterMap clusterMap, RouterConfig routerConfig) {
    String localDatacenterName = clusterMap.getDatacenterName(clusterMap.getLocalDatacenterId());
    switch(routerConfig.routerOperationTrackerMetricScope) {
        case Partition:
            for (PartitionId partitionId : clusterMap.getAllPartitionIds(null)) {
                getBlobLocalDcResourceToLatency.put(partitionId, createHistogram(routerConfig, false));
                getBlobInfoLocalDcResourceToLatency.put(partitionId, createHistogram(routerConfig, false));
                getBlobCrossDcResourceToLatency.put(partitionId, createHistogram(routerConfig, false));
                getBlobInfoCrossDcResourceToLatency.put(partitionId, createHistogram(routerConfig, false));
                putBlobResourceToLatency.put(partitionId, createHistogram(routerConfig, false));
            }
            break;
        case DataNode:
            List<? extends DataNodeId> dataNodeIds = clusterMap.getDataNodeIds();
            for (DataNodeId dataNodeId : dataNodeIds) {
                if (dataNodeId.getDatacenterName().equals(localDatacenterName)) {
                    getBlobLocalDcResourceToLatency.put(dataNodeId, createHistogram(routerConfig, false));
                    getBlobInfoLocalDcResourceToLatency.put(dataNodeId, createHistogram(routerConfig, false));
                    // PutBlob only cares about local-dc data nodes.
                    putBlobResourceToLatency.put(dataNodeId, createHistogram(routerConfig, false));
                } else {
                    getBlobCrossDcResourceToLatency.put(dataNodeId, createHistogram(routerConfig, false));
                    getBlobInfoCrossDcResourceToLatency.put(dataNodeId, createHistogram(routerConfig, false));
                }
            }
            break;
        case Disk:
            for (PartitionId partitionId : clusterMap.getAllPartitionIds(null)) {
                for (ReplicaId replicaId : partitionId.getReplicaIds()) {
                    DiskId diskId = replicaId.getDiskId();
                    if (getBlobLocalDcResourceToLatency.containsKey(diskId) || getBlobCrossDcResourceToLatency.containsKey(diskId)) {
                        continue;
                    }
                    if (replicaId.getDataNodeId().getDatacenterName().equals(localDatacenterName)) {
                        getBlobLocalDcResourceToLatency.put(diskId, createHistogram(routerConfig, false));
                        getBlobInfoLocalDcResourceToLatency.put(diskId, createHistogram(routerConfig, false));
                        putBlobResourceToLatency.put(diskId, createHistogram(routerConfig, false));
                    } else {
                        getBlobCrossDcResourceToLatency.put(diskId, createHistogram(routerConfig, false));
                        getBlobInfoCrossDcResourceToLatency.put(diskId, createHistogram(routerConfig, false));
                    }
                }
            }
            break;
        default:
            break;
    }
}
Also used: PartitionId (com.github.ambry.clustermap.PartitionId), DataNodeId (com.github.ambry.clustermap.DataNodeId), ReplicaId (com.github.ambry.clustermap.ReplicaId), DiskId (com.github.ambry.clustermap.DiskId)
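
The branch taken above is controlled by RouterConfig#routerOperationTrackerMetricScope. A sketch of selecting the Partition scope (the property names and the hostname/datacenter stubs are assumptions based on standard Ambry router config; verify against RouterConfig):

import java.util.Properties;

import com.github.ambry.config.RouterConfig;
import com.github.ambry.config.VerifiableProperties;

class MetricScopeSketch {
    static RouterConfig partitionScopedConfig() {
        Properties props = new Properties();
        props.setProperty("router.hostname", "localhost");  // assumed to be required by RouterConfig
        props.setProperty("router.datacenter.name", "DC1"); // assumed to be required by RouterConfig
        // "Partition", "DataNode" and "Disk" select the three branches of the switch above.
        props.setProperty("router.operation.tracker.metric.scope", "Partition");
        return new RouterConfig(new VerifiableProperties(props));
    }
}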

Aggregations

PartitionId (com.github.ambry.clustermap.PartitionId): 183 usages
MockPartitionId (com.github.ambry.clustermap.MockPartitionId): 111 usages
Test (org.junit.Test): 95 usages
ReplicaId (com.github.ambry.clustermap.ReplicaId): 70 usages
ArrayList (java.util.ArrayList): 68 usages
MockClusterMap (com.github.ambry.clustermap.MockClusterMap): 53 usages
BlobId (com.github.ambry.commons.BlobId): 50 usages
HashMap (java.util.HashMap): 48 usages
Map (java.util.Map): 41 usages
List (java.util.List): 40 usages
MockDataNodeId (com.github.ambry.clustermap.MockDataNodeId): 39 usages
DataNodeId (com.github.ambry.clustermap.DataNodeId): 36 usages
MetricRegistry (com.codahale.metrics.MetricRegistry): 33 usages
ClusterMap (com.github.ambry.clustermap.ClusterMap): 32 usages
MockReplicaId (com.github.ambry.clustermap.MockReplicaId): 30 usages
VerifiableProperties (com.github.ambry.config.VerifiableProperties): 30 usages
IOException (java.io.IOException): 29 usages
HashSet (java.util.HashSet): 29 usages
StoreKey (com.github.ambry.store.StoreKey): 26 usages
StoreKeyFactory (com.github.ambry.store.StoreKeyFactory): 25 usages