use of org.apache.asterix.common.transactions.ILogManager in project asterixdb by apache.
the class RemoteRecoveryManager method completeFailbackProcess.
/**
 * Completes the failback process. For every replica in {@code failbackRecoveryReplicas} it
 * requests the index files of the partitions being recovered, then restarts the local log
 * from the maximum LSN reported by those replicas, starts the replication channel and the
 * replication threads, and finally clears {@code failbackRecoveryReplicas}.
 * <p>
 * An {@link IOException} raised while requesting files is logged at WARNING level and answered
 * by calling {@code startFailbackProcess()}; execution then continues with the remaining steps.
 *
 * @throws IOException          if one is raised outside the file-request phase (including by
 *                              {@code startFailbackProcess()} itself)
 * @throws InterruptedException propagated from the calls made by this method
 */
@Override
public void completeFailbackProcess() throws IOException, InterruptedException {
    ILogManager logManager = runtimeContext.getTransactionSubsystem().getLogManager();
    ReplicaResourcesManager resourcesManager =
            (ReplicaResourcesManager) runtimeContext.getReplicaResourcesManager();
    Map<String, ClusterPartition[]> partitionsByNode =
            runtimeContext.getMetadataProperties().getNodePartitions();

    // For each lost partition, fetch the remaining files from the replicas
    // so that the failback process can be completed.
    try {
        for (Entry<String, Set<String>> planEntry : failbackRecoveryReplicas.entrySet()) {
            String replicaId = planEntry.getKey();
            Set<String> nodesToRecover = planEntry.getValue();
            Set<String> knownFiles = new HashSet<>();
            Set<Integer> partitionIds = new HashSet<>();
            for (String nodeId : nodesToRecover) {
                // every partition of this node is recovered from the current replica
                for (ClusterPartition partition : partitionsByNode.get(nodeId)) {
                    knownFiles.addAll(
                            resourcesManager.getPartitionIndexesFiles(partition.getPartitionId(), true));
                    partitionIds.add(partition.getPartitionId());
                }
            }
            // ask the replica for the index files of these partitions, passing along
            // the files collected above (presumably so they are not sent again)
            replicationManager.requestReplicaFiles(replicaId, partitionIds, knownFiles);
        }
    } catch (IOException e) {
        // A failure while completing failback requires a brand-new plan and a full
        // re-fetch of the files, since the remote replicas will differ in the new plan.
        if (LOGGER.isLoggable(Level.WARNING)) {
            LOGGER.log(Level.WARNING, "Failed during completing failback. Restarting failback process...", e);
        }
        startFailbackProcess();
    }

    // highest LSN among the selected remote replicas
    long highestRemoteLSN = replicationManager.getMaxRemoteLSN(failbackRecoveryReplicas.keySet());
    // make the LogManager continue from a partition beyond that LSN
    logManager.renewLogFilesAndStartFromLSN(highestRemoteLSN);

    // failback is done: bring the replication service back up
    runtimeContext.getReplicationChannel().start();
    runtimeContext.getReplicationManager().startReplicationThreads();
    failbackRecoveryReplicas = null;
}
use of org.apache.asterix.common.transactions.ILogManager in project asterixdb by apache.
the class AbstractCheckpointManager method capture.
/**
 * Creates a {@code Checkpoint}, persists it, and then runs {@code cleanup()}.
 * <p>
 * The checkpoint is built from the log manager's append LSN, the supplied
 * {@code minMCTFirstLSN}, the transaction manager's max job id, the current system time in
 * milliseconds, the {@code sharp} flag and {@code StorageConstants.VERSION}.
 *
 * @param minMCTFirstLSN value stored in the checkpoint as given
 * @param sharp          flag stored in the checkpoint as given
 * @throws HyracksDataException propagated from the calls made by this method
 */
protected void capture(long minMCTFirstLSN, boolean sharp) throws HyracksDataException {
    ILogManager logManager = txnSubsystem.getLogManager();
    ITransactionManager transactionManager = txnSubsystem.getTransactionManager();

    Checkpoint checkpoint = new Checkpoint(
            logManager.getAppendLSN(),
            minMCTFirstLSN,
            transactionManager.getMaxJobId(),
            System.currentTimeMillis(),
            sharp,
            StorageConstants.VERSION);

    persist(checkpoint);
    cleanup();
}
use of org.apache.asterix.common.transactions.ILogManager in project asterixdb by apache.
the class RemoteRecoveryManager method replayReplicaPartitionLogs.
/**
 * Replays the logs of the given partitions and, when requested, flushes all datasets
 * afterwards. Replay starts at the larger of the partitions' minimum LSN and the log
 * manager's smallest readable LSN.
 *
 * @param partitions ids of the partitions whose logs are replayed
 * @param flush      when {@code true}, {@code flushAllDatasets()} is called after the replay
 * @throws HyracksDataException wrapping any {@link IOException} or {@code ACIDException}
 *                              raised by the replay or the flush
 */
@Override
public void replayReplicaPartitionLogs(Set<Integer> partitions, boolean flush) throws HyracksDataException {
    ILogManager logManager = runtimeContext.getTransactionSubsystem().getLogManager();
    long partitionsMinLSN = runtimeContext.getReplicaResourcesManager().getPartitionsMinLSN(partitions);
    // clamp the starting point to the log manager's smallest readable LSN
    long replayStartLSN = Math.max(partitionsMinLSN, logManager.getReadableSmallestLSN());

    // replay logs > replayStartLSN that belong to these partitions
    IRecoveryManager recoveryManager = runtimeContext.getTransactionSubsystem().getRecoveryManager();
    try {
        recoveryManager.replayPartitionsLogs(partitions, logManager.getLogReader(true), replayStartLSN);
        if (flush) {
            runtimeContext.getDatasetLifecycleManager().flushAllDatasets();
        }
    } catch (IOException | ACIDException e) {
        throw new HyracksDataException(e);
    }
}
use of org.apache.asterix.common.transactions.ILogManager in project asterixdb by apache.
the class RemoteRecoveryManager method doRemoteRecoveryPlan.
//TODO refactor common code between remote recovery and failback process
/**
 * Executes a remote recovery plan, retrying after every {@link IOException} until either an
 * attempt succeeds or the configured number of attempts
 * ({@code replicationProperties.getMaxRemoteRecoveryAttempts()}) is used up.
 * <p>
 * Each attempt closes all datasets, deletes the existing storage data, initializes a new
 * storage universe, requests the files of the planned partitions from every replica in the
 * plan, and finally restarts the local log from the maximum LSN reported by those replicas.
 *
 * @param recoveryPlan maps a replica id to the ids of the partitions to request from it
 * @throws HyracksDataException  propagated from the calls made by this method
 * @throws IllegalStateException if no attempt is left; its cause is the {@link IOException}
 *                               of the last failed attempt, or {@code null} when no attempt
 *                               was made at all
 */
@Override
public void doRemoteRecoveryPlan(Map<String, Set<Integer>> recoveryPlan) throws HyracksDataException {
    int maxRecoveryAttempts = replicationProperties.getMaxRemoteRecoveryAttempts();
    PersistentLocalResourceRepository resourceRepository =
            (PersistentLocalResourceRepository) runtimeContext.getLocalResourceRepository();
    IDatasetLifecycleManager datasetLifeCycleManager = runtimeContext.getDatasetLifecycleManager();
    ILogManager logManager = runtimeContext.getTransactionSubsystem().getLogManager();
    // failure of the most recent attempt; reported as the cause once the attempts run out
    IOException lastFailure = null;
    while (true) {
        //start recovery steps
        try {
            if (maxRecoveryAttempts <= 0) {
                //to avoid infinite loop in case of unexpected behavior.
                throw new IllegalStateException("Failed to perform remote recovery.", lastFailure);
            }

            /*** Prepare for Recovery ***/
            //1. clean any memory data that could've existed from previous failed recovery attempt
            datasetLifeCycleManager.closeAllDatasets();

            //2. remove any existing storage data and initialize storage metadata
            resourceRepository.deleteStorageData(true);
            resourceRepository.initializeNewUniverse(ClusterProperties.INSTANCE.getStorageDirectoryName());

            /*** Start Recovery Per Lost Replica ***/
            //3. request indexes metadata and LSM components from every replica in the plan
            for (Entry<String, Set<Integer>> remoteReplica : recoveryPlan.entrySet()) {
                String replicaId = remoteReplica.getKey();
                Set<Integer> partitionsToRecover = remoteReplica.getValue();
                replicationManager.requestReplicaFiles(replicaId, partitionsToRecover, new HashSet<String>());
            }

            //4. get max LSN from selected remote replicas
            long maxRemoteLSN = replicationManager.getMaxRemoteLSN(recoveryPlan.keySet());

            //5. force LogManager to start from a partition > maxLSN in selected remote replicas
            logManager.renewLogFilesAndStartFromLSN(maxRemoteLSN);
            break;
        } catch (IOException e) {
            if (LOGGER.isLoggable(Level.WARNING)) {
                LOGGER.log(Level.WARNING, "Failed during remote recovery. Attempting again...", e);
            }
            lastFailure = e;
            maxRecoveryAttempts--;
        }
    }
}
Aggregations