Use of com.github.ambry.replication.ReplicationException in project ambry by LinkedIn.
Class VcrReplicationManager, method addReplica.
/**
 * Add a replica of the given {@link PartitionId} and its {@link RemoteReplicaInfo}s to the backup list.
 * @param partitionId the {@link PartitionId} of the replica to add.
 * @throws ReplicationException if replica initialization fails.
 */
void addReplica(PartitionId partitionId) throws ReplicationException {
  if (partitionToPartitionInfo.containsKey(partitionId)) {
    throw new ReplicationException("Partition " + partitionId + " already exists on " + dataNodeId);
  }
  ReplicaId cloudReplica = new CloudReplica(partitionId, vcrClusterParticipant.getCurrentDataNodeId());
  if (!storeManager.addBlobStore(cloudReplica)) {
    logger.error("Can't start cloudstore for replica {}", cloudReplica);
    throw new ReplicationException("Can't start cloudstore for replica " + cloudReplica);
  }
  List<? extends ReplicaId> peerReplicas = cloudReplica.getPeerReplicaIds();
  List<RemoteReplicaInfo> remoteReplicaInfos = new ArrayList<>();
  Store store = storeManager.getStore(partitionId);
  if (peerReplicas != null) {
    for (ReplicaId peerReplica : peerReplicas) {
      if (!shouldReplicateFromDc(peerReplica.getDataNodeId().getDatacenterName())) {
        continue;
      }
      // We need to ensure that a replica token gets persisted only after the corresponding data in the
      // store gets flushed to the cloud. We use the store flush interval multiplied by a constant factor
      // to determine the token flush interval.
      FindTokenFactory findTokenFactory = tokenHelper.getFindTokenFactoryFromReplicaType(peerReplica.getReplicaType());
      RemoteReplicaInfo remoteReplicaInfo =
          new RemoteReplicaInfo(peerReplica, cloudReplica, store, findTokenFactory.getNewFindToken(),
              storeConfig.storeDataFlushIntervalSeconds * SystemTime.MsPerSec * Replication_Delay_Multiplier,
              SystemTime.getInstance(), peerReplica.getDataNodeId().getPortToConnectTo());
      replicationMetrics.addMetricsForRemoteReplicaInfo(remoteReplicaInfo, trackPerDatacenterLagInMetric);
      remoteReplicaInfos.add(remoteReplicaInfo);
    }
    rwLock.writeLock().lock();
    try {
      updatePartitionInfoMaps(remoteReplicaInfos, cloudReplica);
      partitionStoreMap.put(partitionId.toPathString(), store);
      // Reload the replication token if one exists.
      int tokenReloadFailCount = reloadReplicationTokenIfExists(cloudReplica, remoteReplicaInfos);
      vcrMetrics.tokenReloadWarnCount.inc(tokenReloadFailCount);
      // Add remoteReplicaInfos to {@link ReplicaThread}.
      addRemoteReplicaInfoToReplicaThread(remoteReplicaInfos, true);
      if (replicationConfig.replicationTrackPerPartitionLagFromRemote) {
        replicationMetrics.addLagMetricForPartition(partitionId, true);
      }
    } finally {
      rwLock.writeLock().unlock();
    }
  } else {
    try {
      storeManager.shutdownBlobStore(partitionId);
      storeManager.removeBlobStore(partitionId);
    } finally {
      throw new ReplicationException(
          "Failed to add partition " + partitionId + " on " + dataNodeId + " because no peer replicas were found.");
    }
  }
}
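The token flush interval passed to the RemoteReplicaInfo constructor above is derived from the store flush interval, as the in-code comment explains. A quick worked example with illustrative values (neither 60 nor 5 is taken from the snippet; they are assumptions for the arithmetic only):

// Illustrative arithmetic only; 60 and 5 are assumed values, not Ambry defaults.
long storeDataFlushIntervalSeconds = 60;  // store flushes to the cloud every 60 s
long replicationDelayMultiplier = 5;      // the constant factor from the comment
long tokenFlushIntervalMs =
    storeDataFlushIntervalSeconds * 1000 * replicationDelayMultiplier;  // 300,000 ms
// Tokens are thus persisted at most every 5 minutes, comfortably after the
// corresponding store data has been flushed to the cloud.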
Use of com.github.ambry.replication.ReplicationException in project ambry by LinkedIn.
Class VcrReplicationManager, method start.
@Override
public void start() throws ReplicationException {
  // Add a listener for newly assigned partitions.
  vcrClusterParticipant.addListener(new VcrClusterParticipantListener() {
    @Override
    public void onPartitionAdded(PartitionId partitionId) {
      if (partitionId.isEqual(cloudConfig.vcrHelixUpdaterPartitionId)) {
        vcrHelixUpdateLock.lock();
        try {
          if (!isAmbryListenerToUpdateVcrHelixRegistered) {
            // Prepare the vcrUpdateDistributedLock. Only one instance can update the VCR Helix cluster at a time.
            // It's possible for isVcrHelixUpdater to be true on two nodes.
            // For example, at time "t" node A is the owner of partition 1. Due to some partition reassignment
            // (say, a new node addition), partition 1 gets assigned to node B at time "t+1". In this case it's
            // possible for node B to get the addPartition notification for partition 1 at "t+2" before node A
            // gets the removePartition notification (at "t+4"). If a main cluster update happens between "t+2"
            // and "t+4", two nodes might try to update the VCR cluster at the same time. Therefore, we need this
            // distributed lock.
            LockScope distributedLockScope = new HelixLockScope(HelixLockScope.LockScopeProperty.CLUSTER,
                Arrays.asList(cloudConfig.vcrClusterName, cloudConfig.vcrClusterName));
            vcrUpdateDistributedLock =
                new ZKDistributedNonblockingLock(distributedLockScope, cloudConfig.vcrClusterZkConnectString,
                    cloudConfig.vcrHelixLockTimeoutInMs, "Updating VCR Cluster", clusterMapConfig.clusterMapHostName);
            // Only register the listener once. Unfortunately, we can't unregister a listener, so we use
            // isAmbryListenerToUpdateVcrHelixRegistered as the flag.
            clusterMap.registerClusterMapListener(new AmbryListenerToUpdateVcrHelix());
            isAmbryListenerToUpdateVcrHelixRegistered = true;
            // Schedule a fixed-rate task to check whether Ambry Helix and VCR Helix are in sync.
            ambryVcrHelixSyncCheckTaskFuture = scheduler.scheduleAtFixedRate(() -> checkAmbryHelixAndVcrHelixOnSync(),
                cloudConfig.vcrHelixSyncCheckIntervalInSeconds, cloudConfig.vcrHelixSyncCheckIntervalInSeconds,
                TimeUnit.SECONDS);
            logger.info("VCR updater registered.");
          }
          isVcrHelixUpdater = true;
          scheduleVcrHelix("VCR starts");
        } finally {
          vcrHelixUpdateLock.unlock();
        }
      }
      try {
        addReplica(partitionId);
        logger.info("Partition {} added to {}", partitionId, dataNodeId);
      } catch (ReplicationException e) {
        vcrMetrics.addPartitionErrorCount.inc();
        logger.error("Exception on adding partition {} to {}: ", partitionId, dataNodeId, e);
      } catch (Exception e) {
        // Helix will run into an error state if an exception is thrown in the Helix context.
        vcrMetrics.addPartitionErrorCount.inc();
        logger.error("Unknown exception on adding partition {} to {}: ", partitionId, dataNodeId, e);
      }
    }

    @Override
    public void onPartitionRemoved(PartitionId partitionId) {
      if (partitionId.isEqual(cloudConfig.vcrHelixUpdaterPartitionId)) {
        vcrHelixUpdateLock.lock();
        try {
          isVcrHelixUpdater = false;
          if (vcrHelixUpdateFuture != null) {
            vcrHelixUpdateFuture.cancel(false);
          }
          if (ambryVcrHelixSyncCheckTaskFuture != null) {
            ambryVcrHelixSyncCheckTaskFuture.cancel(false);
          }
        } finally {
          vcrHelixUpdateLock.unlock();
        }
      }
      try {
        removeReplica(partitionId);
      } catch (Exception e) {
        // Helix will run into an error state if an exception is thrown in the Helix context.
        vcrMetrics.removePartitionErrorCount.inc();
        logger.error("Exception on removing partition {} from {}: ", partitionId, dataNodeId, e);
      }
    }
  });
  try {
    vcrClusterParticipant.participate();
  } catch (Exception e) {
    throw new ReplicationException("Cluster participate failed.", e);
  }
  // Start the scheduler thread that persists replica tokens in the background.
  scheduleTask(persistor, true, replicationConfig.replicationTokenFlushDelaySeconds,
      replicationConfig.replicationTokenFlushIntervalSeconds, "replica token persistor");
  // Schedule a thread to purge dead blobs for this VCR's partitions, after a delay to allow startup to finish.
  scheduleTask(cloudStorageCompactor, cloudConfig.cloudBlobCompactionEnabled,
      cloudConfig.cloudBlobCompactionStartupDelaySecs,
      TimeUnit.HOURS.toSeconds(cloudConfig.cloudBlobCompactionIntervalHours), "cloud blob compaction");
  // Schedule a thread to purge blobs belonging to deprecated containers for this VCR's partitions,
  // after a delay to allow startup to finish.
  scheduleTask(() -> cloudContainerCompactor.compactAssignedDeprecatedContainers(
      vcrClusterParticipant.getAssignedPartitionIds()), cloudConfig.cloudContainerCompactionEnabled,
      cloudConfig.cloudContainerCompactionStartupDelaySecs,
      TimeUnit.HOURS.toSeconds(cloudConfig.cloudContainerCompactionIntervalHours), "cloud container compaction");
  started = true;
  startupLatch.countDown();
}
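The comment in onPartitionAdded explains why vcrUpdateDistributedLock exists: two nodes can transiently both believe they are the Helix updater. The update path that later takes the lock might look roughly like the sketch below. ZKDistributedNonblockingLock exposes tryLock()/unlock(); the updateVcrHelixCluster() helper and the surrounding control flow are illustrative assumptions, not the actual Ambry implementation.

// Hedged sketch: guard the VCR Helix update with the distributed lock so that
// only one node performs the update at a time. updateVcrHelixCluster() is a
// hypothetical helper standing in for the real update logic.
if (vcrUpdateDistributedLock.tryLock()) {
  try {
    updateVcrHelixCluster();
  } finally {
    vcrUpdateDistributedLock.unlock();
  }
} else {
  logger.info("Another VCR node holds the lock; skipping this Helix update.");
}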
Use of com.github.ambry.replication.ReplicationException in project ambry by LinkedIn.
Class CloudTokenPersistor, method retrieve.
@Override
public List<ReplicaTokenInfo> retrieve(String mountPath) throws ReplicationException {
  try {
    ByteArrayOutputStream tokenOutputStream = new ByteArrayOutputStream(4096);
    boolean tokenExists = cloudDestination.retrieveTokens(mountPath, replicaTokenFileName, tokenOutputStream);
    if (tokenExists) {
      InputStream inputStream = new ByteArrayInputStream(tokenOutputStream.toByteArray());
      return replicaTokenSerde.deserializeTokens(inputStream);
    } else {
      return Collections.emptyList();
    }
  } catch (IOException | CloudStorageException e) {
    throw new ReplicationException("IO error while reading from replica token file at mount path " + mountPath, e);
  }
}
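For symmetry, the write side would serialize tokens through the same serde and hand the bytes to the cloud destination. A minimal sketch, assuming a CloudDestination persistTokens method mirroring the retrieveTokens call above and a matching serializeTokens on the serde (both method names are assumptions, not confirmed from the snippet):

// Hedged sketch of the write side; persistTokens and serializeTokens are
// assumed counterparts of the retrieve path shown above.
public void persist(String mountPath, List<ReplicaTokenInfo> tokenInfoList) throws ReplicationException {
  try {
    ByteArrayOutputStream tokenOutputStream = new ByteArrayOutputStream(4096);
    replicaTokenSerde.serializeTokens(tokenInfoList, tokenOutputStream);
    InputStream inputStream = new ByteArrayInputStream(tokenOutputStream.toByteArray());
    cloudDestination.persistTokens(mountPath, replicaTokenFileName, inputStream);
  } catch (IOException | CloudStorageException e) {
    throw new ReplicationException("IO error while persisting replica tokens at mount path " + mountPath, e);
  }
}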