Search in sources :

Example 1 with ReplicationException

use of com.github.ambry.replication.ReplicationException in project ambry by linkedin.

the class VcrReplicationManager method addReplica.

/**
 * Add a replica of given {@link PartitionId} and its {@link RemoteReplicaInfo}s to backup list.
 *
 * <p>A cloud store is started for the partition first. If the replica reports no peer replica list
 * ({@code getPeerReplicaIds()} returns {@code null}), the store is shut down and removed again and the
 * call fails. Otherwise a {@link RemoteReplicaInfo} is created for every peer located in a datacenter
 * accepted by {@code shouldReplicateFromDc}, and the partition is registered under the write lock.
 *
 * @param partitionId the {@link PartitionId} of the replica to add.
 * @throws ReplicationException if the partition is already present, the cloud store cannot be started,
 *         or no peer replicas are found. In the last case, an exception raised while cleaning up the
 *         store is attached as a suppressed exception instead of being silently discarded.
 */
void addReplica(PartitionId partitionId) throws ReplicationException {
    if (partitionToPartitionInfo.containsKey(partitionId)) {
        throw new ReplicationException("Partition " + partitionId + " already exists on " + dataNodeId);
    }
    ReplicaId cloudReplica = new CloudReplica(partitionId, vcrClusterParticipant.getCurrentDataNodeId());
    if (!storeManager.addBlobStore(cloudReplica)) {
        logger.error("Can't start cloudstore for replica {}", cloudReplica);
        throw new ReplicationException("Can't start cloudstore for replica " + cloudReplica);
    }
    List<? extends ReplicaId> peerReplicas = cloudReplica.getPeerReplicaIds();
    Store store = storeManager.getStore(partitionId);
    if (peerReplicas == null) {
        // Nothing to replicate from: undo the store creation and report the failure. The exception is
        // built up front so that a cleanup failure can be recorded on it rather than lost (throwing from
        // a finally block would discard whatever the cleanup calls threw).
        ReplicationException noPeersFailure = new ReplicationException("Failed to add Partition " + partitionId + " on " + dataNodeId + " , because no peer replicas found.");
        try {
            storeManager.shutdownBlobStore(partitionId);
            storeManager.removeBlobStore(partitionId);
        } catch (Exception cleanupFailure) {
            logger.error("Failed to clean up cloudstore for partition {} after finding no peer replicas", partitionId, cleanupFailure);
            noPeersFailure.addSuppressed(cleanupFailure);
        }
        throw noPeersFailure;
    }
    List<RemoteReplicaInfo> remoteReplicaInfos = new ArrayList<>();
    for (ReplicaId peerReplica : peerReplicas) {
        if (!shouldReplicateFromDc(peerReplica.getDataNodeId().getDatacenterName())) {
            continue;
        }
        // We need to ensure that a replica token gets persisted only after the corresponding data in the
        // store gets flushed to cloud. We use the store flush interval multiplied by a constant factor
        // to determine the token flush interval
        FindTokenFactory findTokenFactory = tokenHelper.getFindTokenFactoryFromReplicaType(peerReplica.getReplicaType());
        RemoteReplicaInfo remoteReplicaInfo = new RemoteReplicaInfo(peerReplica, cloudReplica, store, findTokenFactory.getNewFindToken(), storeConfig.storeDataFlushIntervalSeconds * SystemTime.MsPerSec * Replication_Delay_Multiplier, SystemTime.getInstance(), peerReplica.getDataNodeId().getPortToConnectTo());
        replicationMetrics.addMetricsForRemoteReplicaInfo(remoteReplicaInfo, trackPerDatacenterLagInMetric);
        remoteReplicaInfos.add(remoteReplicaInfo);
    }
    rwLock.writeLock().lock();
    try {
        updatePartitionInfoMaps(remoteReplicaInfos, cloudReplica);
        partitionStoreMap.put(partitionId.toPathString(), store);
        // Reload replication token if exist.
        int tokenReloadFailCount = reloadReplicationTokenIfExists(cloudReplica, remoteReplicaInfos);
        vcrMetrics.tokenReloadWarnCount.inc(tokenReloadFailCount);
        // Add remoteReplicaInfos to {@link ReplicaThread}.
        addRemoteReplicaInfoToReplicaThread(remoteReplicaInfos, true);
        if (replicationConfig.replicationTrackPerPartitionLagFromRemote) {
            replicationMetrics.addLagMetricForPartition(partitionId, true);
        }
    } finally {
        rwLock.writeLock().unlock();
    }
}
Also used : CloudReplica(com.github.ambry.clustermap.CloudReplica) RemoteReplicaInfo(com.github.ambry.replication.RemoteReplicaInfo) ArrayList(java.util.ArrayList) Store(com.github.ambry.store.Store) ReplicationException(com.github.ambry.replication.ReplicationException) FindTokenFactory(com.github.ambry.replication.FindTokenFactory) ReplicaId(com.github.ambry.clustermap.ReplicaId)

Example 2 with ReplicationException

use of com.github.ambry.replication.ReplicationException in project ambry by linkedin.

the class VcrReplicationManager method start.

/**
 * Starts this replication manager.
 *
 * <p>Registers a {@link VcrClusterParticipantListener} that adds or removes a replica whenever a partition
 * is assigned to or removed from this node, then calls {@code vcrClusterParticipant.participate()}.
 * Afterwards it schedules the replica token persistor, the cloud blob compaction task and the cloud
 * container compaction task, sets {@code started} and counts down {@code startupLatch}.
 *
 * @throws ReplicationException if {@code vcrClusterParticipant.participate()} throws.
 */
@Override
public void start() throws ReplicationException {
    // Add a listener for partitions newly assigned to (or removed from) this node.
    vcrClusterParticipant.addListener(new VcrClusterParticipantListener() {

        @Override
        public void onPartitionAdded(PartitionId partitionId) {
            // The partition configured as vcrHelixUpdaterPartitionId additionally makes this node the
            // VCR helix updater; that state is only changed while holding vcrHelixUpdateLock.
            if (partitionId.isEqual(cloudConfig.vcrHelixUpdaterPartitionId)) {
                vcrHelixUpdateLock.lock();
                try {
                    if (!isAmbryListenerToUpdateVcrHelixRegistered) {
                        // Prepare the vcrUpdateDistributedLock. Only one instance can update vcr helix cluster at one time.
                        // It's possible isVcrHelixUpdater to be true on two nodes.
                        // For example, at time "t" node A is the owner of partition 1. Due to some partition reassignment
                        // (lets say new node addition), partition 1 get assigned to node B at time "t+1". In this case it's possible
                        // for Node B to get notification of addPartition of partition 1 at "t+2" before Node A gets removePartition
                        // notification (at t+4). If a main cluster update happens between "t+2" and "t+4", then two nodes might try
                        // to update vcr cluster at the same time. Therefore, we need this distributed lock.
                        LockScope distributedLockScope = new HelixLockScope(HelixLockScope.LockScopeProperty.CLUSTER, Arrays.asList(cloudConfig.vcrClusterName, cloudConfig.vcrClusterName));
                        vcrUpdateDistributedLock = new ZKDistributedNonblockingLock(distributedLockScope, cloudConfig.vcrClusterZkConnectString, cloudConfig.vcrHelixLockTimeoutInMs, "Updating VCR Cluster", clusterMapConfig.clusterMapHostName);
                        // Only register the listener once. Unfortunately, we can't unregister a listener, so we use
                        // isAmbryListenerToUpdateVcrHelixRegistered as the flag.
                        clusterMap.registerClusterMapListener(new AmbryListenerToUpdateVcrHelix());
                        isAmbryListenerToUpdateVcrHelixRegistered = true;
                        // Schedule a fixed rate task to check whether ambry helix and vcr helix are in sync.
                        // NOTE(review): this task is only scheduled inside this one-time registration branch, while
                        // onPartitionRemoved cancels it; it looks like it is not rescheduled if the updater partition
                        // is later re-assigned to this node - confirm whether that is intended.
                        ambryVcrHelixSyncCheckTaskFuture = scheduler.scheduleAtFixedRate(() -> checkAmbryHelixAndVcrHelixOnSync(), cloudConfig.vcrHelixSyncCheckIntervalInSeconds, cloudConfig.vcrHelixSyncCheckIntervalInSeconds, TimeUnit.SECONDS);
                        logger.info("VCR updater registered.");
                    }
                    isVcrHelixUpdater = true;
                    scheduleVcrHelix("VCR starts");
                } finally {
                    vcrHelixUpdateLock.unlock();
                }
            }
            // Every added partition (including the updater partition) gets a replica. Failures are counted
            // and logged here rather than rethrown.
            try {
                addReplica(partitionId);
                logger.info("Partition {} added to {}", partitionId, dataNodeId);
            } catch (ReplicationException e) {
                vcrMetrics.addPartitionErrorCount.inc();
                logger.error("Exception on adding Partition {} to {}: ", partitionId, dataNodeId, e);
            } catch (Exception e) {
                // Helix will run into error state if an exception is thrown in Helix context.
                vcrMetrics.addPartitionErrorCount.inc();
                logger.error("Unknown Exception on adding Partition {} to {}: ", partitionId, dataNodeId, e);
            }
        }

        @Override
        public void onPartitionRemoved(PartitionId partitionId) {
            // Losing the updater partition clears the updater flag and cancels the pending helix update
            // and sync-check futures (without interrupting a run already in progress: cancel(false)).
            if (partitionId.isEqual(cloudConfig.vcrHelixUpdaterPartitionId)) {
                vcrHelixUpdateLock.lock();
                try {
                    isVcrHelixUpdater = false;
                    if (vcrHelixUpdateFuture != null) {
                        vcrHelixUpdateFuture.cancel(false);
                    }
                    if (ambryVcrHelixSyncCheckTaskFuture != null) {
                        ambryVcrHelixSyncCheckTaskFuture.cancel(false);
                    }
                } finally {
                    vcrHelixUpdateLock.unlock();
                }
            }
            try {
                removeReplica(partitionId);
            } catch (Exception e) {
                // Helix will run into error state if an exception is thrown in Helix context.
                vcrMetrics.removePartitionErrorCount.inc();
                logger.error("Exception on removing Partition {} from {}: ", partitionId, dataNodeId, e);
            }
        }
    });
    // Join the cluster only after the listener above is registered; any failure is wrapped
    // in a ReplicationException with the original exception as its cause.
    try {
        vcrClusterParticipant.participate();
    } catch (Exception e) {
        throw new ReplicationException("Cluster participate failed.", e);
    }
    // Schedule the replica token persistor to run in the background, using the configured
    // token flush delay and interval.
    scheduleTask(persistor, true, replicationConfig.replicationTokenFlushDelaySeconds, replicationConfig.replicationTokenFlushIntervalSeconds, "replica token persistor");
    // Schedule thread to purge dead blobs for this VCR's partitions
    // after delay to allow startup to finish.
    scheduleTask(cloudStorageCompactor, cloudConfig.cloudBlobCompactionEnabled, cloudConfig.cloudBlobCompactionStartupDelaySecs, TimeUnit.HOURS.toSeconds(cloudConfig.cloudBlobCompactionIntervalHours), "cloud blob compaction");
    // Schedule thread to purge blobs belonging to deprecated containers for this VCR's partitions
    // after delay to allow startup to finish.
    scheduleTask(() -> cloudContainerCompactor.compactAssignedDeprecatedContainers(vcrClusterParticipant.getAssignedPartitionIds()), cloudConfig.cloudContainerCompactionEnabled, cloudConfig.cloudContainerCompactionStartupDelaySecs, TimeUnit.HOURS.toSeconds(cloudConfig.cloudContainerCompactionIntervalHours), "cloud container compaction");
    // Mark startup as complete and count down the startup latch.
    started = true;
    startupLatch.countDown();
}
Also used : VcrClusterParticipantListener(com.github.ambry.clustermap.VcrClusterParticipantListener) LockScope(org.apache.helix.lock.LockScope) HelixLockScope(org.apache.helix.lock.helix.HelixLockScope) HelixLockScope(org.apache.helix.lock.helix.HelixLockScope) ReplicationException(com.github.ambry.replication.ReplicationException) PartitionId(com.github.ambry.clustermap.PartitionId) ZKDistributedNonblockingLock(org.apache.helix.lock.helix.ZKDistributedNonblockingLock) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) ReplicationException(com.github.ambry.replication.ReplicationException)

Example 3 with ReplicationException

use of com.github.ambry.replication.ReplicationException in project ambry by linkedin.

the class CloudTokenPersistor method retrieve.

/**
 * Reads the replica tokens stored for the given mount path from the cloud destination.
 *
 * @param mountPath the mount path whose replica token file should be read.
 * @return the deserialized tokens, or an empty list when the cloud destination reports that no
 *         token file exists.
 * @throws ReplicationException if fetching or deserializing the tokens fails with an
 *         {@link IOException} or a {@code CloudStorageException}.
 */
@Override
public List<ReplicaTokenInfo> retrieve(String mountPath) throws ReplicationException {
    try {
        ByteArrayOutputStream tokenBuffer = new ByteArrayOutputStream(4096);
        // retrieveTokens fills the buffer and reports whether a token file was found.
        if (!cloudDestination.retrieveTokens(mountPath, replicaTokenFileName, tokenBuffer)) {
            return Collections.emptyList();
        }
        InputStream serializedTokens = new ByteArrayInputStream(tokenBuffer.toByteArray());
        return replicaTokenSerde.deserializeTokens(serializedTokens);
    } catch (IOException | CloudStorageException e) {
        throw new ReplicationException("IO error while reading from replica token file at mount path " + mountPath, e);
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) ReplicationException(com.github.ambry.replication.ReplicationException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException)

Aggregations

ReplicationException (com.github.ambry.replication.ReplicationException)3 JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException)1 CloudReplica (com.github.ambry.clustermap.CloudReplica)1 PartitionId (com.github.ambry.clustermap.PartitionId)1 ReplicaId (com.github.ambry.clustermap.ReplicaId)1 VcrClusterParticipantListener (com.github.ambry.clustermap.VcrClusterParticipantListener)1 FindTokenFactory (com.github.ambry.replication.FindTokenFactory)1 RemoteReplicaInfo (com.github.ambry.replication.RemoteReplicaInfo)1 Store (com.github.ambry.store.Store)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 LockScope (org.apache.helix.lock.LockScope)1 HelixLockScope (org.apache.helix.lock.helix.HelixLockScope)1 ZKDistributedNonblockingLock (org.apache.helix.lock.helix.ZKDistributedNonblockingLock)1