Search in sources :

Example 1 with DbRepairStatus

use of com.emc.vipr.model.sys.recovery.DbRepairStatus in project coprhd-controller by CoprHD.

the class DbManager method getLastRepairStatus.

/**
 * Returns the status of the most recent db repair as recorded in ZK and, when that
 * repair is still marked IN_PROGRESS but no node holds the repair lock, tries to
 * resume it.
 *
 * @param forCurrentNodesOnly when true, the current cluster state digest is passed to the
 *                            status lookup; otherwise null is passed in its place
 * @return the last repair status (may be null), or null when any exception is raised
 *         while querying or resuming
 */
@Override
public DbRepairStatus getLastRepairStatus(boolean forCurrentNodesOnly) {
    try {
        DbRepairJobState jobState = DbRepairRunnable.queryRepairState(
                this.coordinator,
                this.schemaUtil.getKeyspaceName(),
                this.schemaUtil.isGeoDbsvc());
        log.info("cluster state digest stored in ZK: {}", jobState.getCurrentDigest());

        DbRepairStatus lastStatus = getLastRepairStatus(
                jobState,
                forCurrentNodesOnly ? DbRepairRunnable.getClusterStateDigest() : null,
                this.repairRetryTimes);

        boolean markedInProgress = lastStatus != null
                && lastStatus.getStatus() == DbRepairStatus.Status.IN_PROGRESS;
        if (!markedInProgress) {
            return lastStatus;
        }

        // The repair is flagged as running; check whether anybody actually holds the repair lock.
        InterProcessLock repairLock = coordinator.getLock(DbRepairRunnable.getLockName());
        String lockHolder = DbRepairRunnable.getSelfLockNodeId(repairLock);
        if (lockHolder != null) {
            return lastStatus;
        }

        // Nobody is driving the repair, so try to resume it from here.
        boolean resumed = startNodeRepair(this.schemaUtil.getKeyspaceName(), this.repairRetryTimes, false, true);
        if (resumed) {
            log.info("Successfully resumed a previously paused repair");
        } else {
            log.warn("Cannot resume a previously paused repair, it could be another thread resumed and finished it");
        }
        return lastStatus;
    } catch (Exception e) {
        // Best effort: callers get null when the state cannot be read.
        log.error("Failed to get node repair state from ZK", e);
        return null;
    }
}
Also used : DbRepairStatus(com.emc.vipr.model.sys.recovery.DbRepairStatus) InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with DbRepairStatus

use of com.emc.vipr.model.sys.recovery.DbRepairStatus in project coprhd-controller by CoprHD.

the class DbManagerOps method waitDbRepairFinish.

/**
 * Polls the last repair status once per second until it is no longer IN_PROGRESS.
 * Progress is logged only when the reported percentage changes.
 *
 * @param forCurrentStateOnly forwarded unchanged to {@code getLastRepairStatus}
 * @return the first status that is not IN_PROGRESS, or null as soon as no status is found
 * @throws Exception whatever the status lookup or {@code Thread.sleep} raises
 */
public DbRepairStatus waitDbRepairFinish(boolean forCurrentStateOnly) throws Exception {
    final String scopeFlag = forCurrentStateOnly ? "true" : "false";
    int reportedProgress = -1;
    while (true) {
        DbRepairStatus current = getLastRepairStatus(forCurrentStateOnly);
        if (current == null) {
            log.info("No db repair found(forCurrentStateOnly={})", scopeFlag);
            return null;
        }
        if (current.getStatus() != DbRepairStatus.Status.IN_PROGRESS) {
            log.info("Db repair(forCurrentStateOnly={}) finished with state: {}", scopeFlag, current.toString());
            return current;
        }

        int progressNow = current.getProgress();
        if (progressNow != reportedProgress) {
            log.info("Db repair started at {} is in progress {}%", current.getStartTime(), progressNow);
            reportedProgress = progressNow;
        }

        // Still running: pause before the next poll.
        Thread.sleep(1000);
    }
}
Also used : DbRepairStatus(com.emc.vipr.model.sys.recovery.DbRepairStatus)

Example 3 with DbRepairStatus

use of com.emc.vipr.model.sys.recovery.DbRepairStatus in project coprhd-controller by CoprHD.

the class DbManagerOps method startNodeRepairAndWaitFinish.

/**
 * Starts a db node repair and blocks until it finishes, retrying up to
 * {@code DB_REPAIR_MAX_RETRY_COUNT} times when no repair is found for the current
 * cluster state.
 *
 * @param canResume when true and a succeeded repair status already exists for the current
 *                  state, the method returns immediately; also forwarded to {@code startNodeRepair}
 * @param crossVdc  forwarded unchanged to {@code startNodeRepair}
 * @throws IllegalStateException if the repair ends in FAILED status, or if no repair status
 *                               is available at all once the retries are exhausted
 * @throws Exception whatever the underlying start/wait calls raise
 */
public void startNodeRepairAndWaitFinish(boolean canResume, boolean crossVdc) throws Exception {
    if (canResume && getLastSucceededRepairStatus(true) != null) {
        log.info("Resume last successful repair");
        return;
    }
    DbRepairStatus state = null;
    for (int i = 0; i < DB_REPAIR_MAX_RETRY_COUNT; i++) {
        startNodeRepair(canResume, crossVdc);
        state = waitDbRepairFinish(true);
        if (state != null) {
            break;
        }
        // It could be that the cluster state changed, so we have to wait for ANY repair to finish here.
        // We don't care whether it is NotFound, Success or Failed for another state; the repair for
        // the current state has failed anyway.
        log.error("No db repair found for current cluster state, waiting for possible stale repair to finish");
        state = waitDbRepairFinish(false);
        // Trigger a new db repair
        log.info("Trigger a new db repair for current cluster state");
    }
    // waitDbRepairFinish may return null, and the loop may not run at all; without this guard the
    // status check below would fail with a bare NullPointerException instead of a meaningful error.
    if (state == null) {
        log.error("No db repair status available after {} attempt(s)", DB_REPAIR_MAX_RETRY_COUNT);
        throw new IllegalStateException("Repair failed: no repair status found");
    }
    if (state.getStatus() == DbRepairStatus.Status.FAILED) {
        log.error("Db node repair started at {} is failed", state.getStartTime());
        throw new IllegalStateException("Repair failed");
    }
    log.info("Db node repair started at {} is finished", state.getStartTime());
}
Also used : DbRepairStatus(com.emc.vipr.model.sys.recovery.DbRepairStatus)

Example 4 with DbRepairStatus

use of com.emc.vipr.model.sys.recovery.DbRepairStatus in project coprhd-controller by CoprHD.

the class DbRepairStatusHandler method queryDbRepairStatus.

/**
 * Looks up the repair status of one db service (dbsvc or geodbsvc).
 * <p>
 * The latest repair status is returned as-is when it carries a completion time. Otherwise
 * its fields are kept and any gaps are filled in from the last succeeded repair status.
 *
 * @param svcName name of the service used to open the {@code DbManagerOps} connection
 * @return the latest status, a merged status, or null when no status value could be determined
 * @throws Exception whatever the {@code DbManagerOps} calls raise
 */
private DbRepairStatus queryDbRepairStatus(String svcName) throws Exception {
    log.info("Try to get repair status of {}", svcName);

    int progress = -1;
    DbRepairStatus.Status status = null;
    Date startTime = null;
    Date endTime = null;

    try (DbManagerOps ops = new DbManagerOps(svcName)) {
        DbRepairStatus latest = ops.getLastRepairStatus(false);
        if (latest != null) {
            log.info("Current repair status of {} is: {}", svcName, latest.toString());
            endTime = latest.getLastCompletionTime();
            if (endTime != null) {
                // A completed repair is reported directly, no merge needed.
                return latest;
            }
            progress = latest.getProgress();
            status = latest.getStatus();
            startTime = latest.getStartTime();
        }

        DbRepairStatus lastSucceeded = ops.getLastSucceededRepairStatus(false);
        if (lastSucceeded != null) {
            log.info("Last successful repair status of {} is: {}", svcName, lastSucceeded.toString());
            if (progress == -1) {
                progress = lastSucceeded.getProgress();
            }
            if (status == null) {
                status = lastSucceeded.getStatus();
            }
            if (startTime == null) {
                startTime = lastSucceeded.getStartTime();
            }
            // endTime is always null here; a non-null value would have returned above.
            endTime = lastSucceeded.getLastCompletionTime();
        }
    }

    if (status == null) {
        return null;
    }
    return new DbRepairStatus(status, startTime, endTime, progress);
}
Also used : DbManagerOps(com.emc.storageos.management.jmx.recovery.DbManagerOps) DbRepairStatus(com.emc.vipr.model.sys.recovery.DbRepairStatus) Date(java.util.Date)

Example 5 with DbRepairStatus

use of com.emc.vipr.model.sys.recovery.DbRepairStatus in project coprhd-controller by CoprHD.

the class DbRepairStatusHandler method getDbRepairStatus.

/**
 * Gets the node repair status, combining the local db and geo db repair statuses.
 * <p>
 * Combining the two is tricky because local db and geo db repair can be triggered
 * individually, so several workarounds are needed. IN_PROGRESS is set before the actual
 * db repair is performed in DbRepairRunnable (before the DB_REPAIR lock is taken), so
 * IN_PROGRESS can be used here to tell whether another db repair is pending and whether
 * the two results can be merged. For a db repair triggered by the scheduler, geo db repair
 * cannot tell whether local db repair has finished, because IN_PROGRESS will already have
 * been set to DONE (i.e. geo db repair does not know whether it was triggered by restarting
 * the geo service alone or by a node restart); INTERVAL_TIME_IN_MINUTES is used to decide.
 * <p>
 * The general rules are:
 * <ol>
 * <li>Node recovery: always merge the result. For example, local db repair at 50% itself is
 * reported as 25%, and geo db repair at 50% itself is reported as 75%. Local db repair
 * always comes first.</li>
 * <li>Node restart: always merge the result. Geo db repair is detected through the
 * IN_PROGRESS flag of local db repair; local db repair is detected by checking the
 * lastCompletionTime of geo db repair against 3 hours.</li>
 * <li>Restarting one db service alone: the local db repair progress is returned directly,
 * without any merge.</li>
 * </ol>
 * <p>
 * Note: for simplicity of explanation, local db repair is described as the first to grab
 * the DB_REPAIR lock and geo db repair as the second, even though which one gets the lock
 * depends on which service boots up first; this does not affect the result.
 *
 * @return the combined repair status; NOT_STARTED when neither service reports a status
 * @throws Exception whatever the underlying status queries raise
 */
public DbRepairStatus getDbRepairStatus() throws Exception {
    DbRepairStatus combined = new DbRepairStatus();
    DbRepairStatus local = queryDbRepairStatus(serviceNames.get(0));
    DbRepairStatus geo = queryDbRepairStatus(serviceNames.get(1));
    boolean nodeRecovery = isNodeRecoveryDbRepairInProgress();

    String localText = (local == null) ? null : local.toString();
    String geoText = (geo == null) ? null : geo.toString();
    log.info("Query repair status of dbsvc({}) and geodbsvc({}) successfully", localText, geoText);
    log.info("db repair running in node recovery? {}", nodeRecovery);

    if (local == null && geo == null) {
        combined.setStatus(DbRepairStatus.Status.NOT_STARTED);
        return combined;
    }

    if (local == null) {
        // Only geo db reported a status.
        combined = geo;
    } else if (geo == null) {
        // Only local db reported a status.
        combined = local;
    } else {
        DbRepairStatus.Status localStatus = local.getStatus();
        DbRepairStatus.Status geoStatus = geo.getStatus();
        boolean localRunning = localStatus == DbRepairStatus.Status.IN_PROGRESS;
        boolean geoRunning = geoStatus == DbRepairStatus.Status.IN_PROGRESS;

        if (localRunning && geoRunning) {
            log.info("local/geo db repair are in progress both");
            combined = getDualProgressStatus(local, geo);
        } else if (localRunning) {
            log.info("local db repair is in progress");
            combined = getSingleProgressStatus(local, geo, nodeRecovery, false);
        } else if (geoRunning) {
            log.info("geo db repair is in progress");
            combined = getSingleProgressStatus(geo, local, nodeRecovery, true);
        } else if (localStatus == DbRepairStatus.Status.FAILED || geoStatus == DbRepairStatus.Status.FAILED) {
            log.info("local or geo db repair failed");
            combined = getFailStatus(local, geo);
        } else if (localStatus == DbRepairStatus.Status.SUCCESS && geoStatus == DbRepairStatus.Status.SUCCESS) {
            log.info("local and geo db repair success");
            combined = getSuccessStatus(local, geo);
        }
        // Any other status pair leaves the freshly constructed, unset status in place.
    }

    log.info("Repair status is: {}", combined.toString());
    return combined;
}
Also used : DbRepairStatus(com.emc.vipr.model.sys.recovery.DbRepairStatus)

Aggregations

DbRepairStatus (com.emc.vipr.model.sys.recovery.DbRepairStatus)10 Date (java.util.Date)4 DateTime (org.joda.time.DateTime)2 DbManagerOps (com.emc.storageos.management.jmx.recovery.DbManagerOps)1 Restrictions (controllers.deadbolt.Restrictions)1 ExecutionException (java.util.concurrent.ExecutionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 InterProcessLock (org.apache.curator.framework.recipes.locks.InterProcessLock)1