Search in sources :

Example 21 with InterProcessLock

use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.

the class DbServiceImpl method start.

/**
 * Starts the DB service.
 *
 * Sequence visible in this method:
 * 1. Publish this instance and set the Cassandra-related system properties.
 * 2. Under the schema lock: run the configuration checks, determine the startup mode,
 *    optionally check the db offline info, start the optional JMX server and then the
 *    embedded {@link CassandraDaemon}.
 * 3. Mark this node as joined, wait for all nodes, start the service beacon and db client.
 * 4. Set up bootstrap info, run the migration handler and, if it succeeds, start the
 *    background tasks.
 *
 * @throws IOException declared by the signature; not thrown directly in this body
 * @throws IllegalStateException wrapping any exception raised in the locked startup section
 */
@Override
public void start() throws IOException {
    if (_log.isInfoEnabled()) {
        _log.info("Starting DB service...");
    }
    // Suppress Sonar violation of Lazy initialization of static fields should be synchronized
    // start() method will be only called one time when startup dbsvc, so it's safe to ignore sonar violation
    // NOSONAR ("squid:S2444")
    instance = this;
    if (backCompatPreYoda) {
        _log.info("Pre-yoda back compatible flag detected. Initialize local keystore/truststore for Cassandra native encryption");
        initKeystoreAndTruststore();
        _schemaUtil.setBackCompatPreYoda(true);
    }
    // Point Cassandra at our configuration and at the custom config loader class.
    System.setProperty("cassandra.config", _config);
    System.setProperty("cassandra.config.loader", CassandraConfigLoader.class.getName());
    // Set to false to clear all gossip state for the node on restart.
    //
    // We encountered a weird Cassandra gossip issue (COP-19246) - some nodes are missing from gossip
    // when rebooting the entire cluster simultaneously. Critical gossip fields (ApplicationState.STATUS,
    // ApplicationState.TOKENS) are not synchronized during handshaking. It looks like a problem caused by
    // an incorrect gossip version/generation in the system local table. So this option was added to clean up
    // the local gossip state during reboot.
    //
    // Make sure add-vdc/add-standby still pass if you remove this option in the future.
    //
    // We need to make sure the majority of local nodes are added as seed nodes. Otherwise Cassandra may not
    // see other nodes if it loses connection to other sites.
    System.setProperty("cassandra.load_ring_state", "false");
    // See https://docs.datastax.com/en/cassandra/2.0/cassandra/operations/ops_add_dc_to_cluster_t.html
    if (_schemaUtil.isStandby()) {
        System.setProperty("cassandra.auto_bootstrap", "false");
    }
    // Declared outside the try so the catch/finally blocks and the code after them can see them.
    InterProcessLock lock = null;
    Configuration config = null;
    StartupMode mode = null;
    try {
        // we use this lock to discourage more than one node bootstrapping / joining at the same time
        // Cassandra can handle this but it's generally not recommended to make changes to schema concurrently
        // NOTE(review): getLock(...) presumably returns an already-acquired lock - confirm against the helper.
        lock = getLock(getSchemaLockName());
        config = checkConfiguration();
        checkGlobalConfiguration();
        checkVersionedConfiguration();
        removeStaleConfiguration();
        mode = checkStartupMode(config);
        _log.info("Current startup mode is {}", mode);
        // Check if service is allowed to get started by querying db offline info to avoid bringing back stale data.
        // Skipping hibernate mode for node recovery procedure to recover the overdue node.
        int nodeCount = ((CoordinatorClientImpl) _coordinator).getNodeCount();
        if (nodeCount != 1 && mode.type != StartupMode.StartupModeType.HIBERNATE_MODE) {
            checkDBOfflineInfo(_coordinator, _serviceInfo.getName(), dbDir, true);
        }
        // this call causes instantiation of a seed provider instance, so the check*Configuration
        // calls must precede it
        removeCassandraSavedCaches();
        mode.onPreStart();
        // The JMX server is optional; when present, publish its port through the JMX system property.
        if (_jmxServer != null) {
            _jmxServer.start();
            System.setProperty("com.sun.management.jmxremote.port", Integer.toString(_jmxServer.getPort()));
        }
        _service = new CassandraDaemon();
        _service.init(null);
        _service.start();
        cassandraInitialized = true;
        mode.onPostStart();
    } catch (Exception e) {
        // In hibernate mode, print the recovery workaround before failing.
        if (mode != null && mode.type == StartupMode.StartupModeType.HIBERNATE_MODE) {
            printRecoveryWorkAround(e);
        }
        _log.error("e=", e);
        // Any failure in the locked section aborts startup; the original exception is kept as the cause.
        throw new IllegalStateException(e);
    } finally {
        if (lock != null) {
            try {
                lock.release();
            } catch (Exception ignore) {
                // Best effort: a failed release is only logged at debug level (the exception itself is not logged).
                _log.debug("lock release failed");
            }
        }
    }
    // Reached only when the try block completed without an exception.
    // NOTE(review): assumes checkConfiguration() never returns null - confirm.
    if (config.getConfig(DbConfigConstants.JOINED) == null) {
        // First successful start of this node: record the JOINED flag in the persisted service configuration.
        config.setConfig(DbConfigConstants.JOINED, Boolean.TRUE.toString());
        _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config);
    }
    _statusChecker.waitForAllNodesJoined();
    _svcBeacon.start();
    if (backCompatPreYoda) {
        _log.info("Enable duplicated beacon in global area during pre-yoda upgrade");
        startDupBeacon();
    }
    setDbInitializedFlag();
    setDbConfigInitDone();
    _dbClient.start();
    // On a standby site, validate the local data revision when one is available.
    if (_schemaUtil.isStandby()) {
        String localDataRevision = getLocalDataRevision();
        if (localDataRevision != null) {
            _schemaUtil.checkDataRevision(localDataRevision);
        }
    }
    // Setup the vdc information, so that login enabled before migration
    if (!isGeoDbsvc()) {
        _schemaUtil.checkAndSetupBootStrapInfo(_dbClient);
    }
    dbMgr.init();
    // _handler.run() returns true when migration finished (or was not needed) and false on failure.
    if (_handler.run()) {
        // Setup the bootstrap info root tenant, if root tenant migrated from local db, then skip it
        if (isGeoDbsvc()) {
            _schemaUtil.checkAndSetupBootStrapInfo(_dbClient);
        } else {
            _schemaUtil.checkAndInitStorageSystemTypes(_dbClient);
        }
        startBackgroundTasks();
        _log.info("DB service started");
    } else {
        // Migration failure is logged but not thrown; the method returns without starting background tasks.
        _log.error("DB migration failed. Skipping starting background tasks.");
    }
}
Also used : CoordinatorClientImpl(com.emc.storageos.coordinator.client.service.impl.CoordinatorClientImpl) Configuration(com.emc.storageos.coordinator.common.Configuration) InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) IOException(java.io.IOException) CassandraDaemon(org.apache.cassandra.service.CassandraDaemon)

Example 22 with InterProcessLock

use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.

the class MigrationHandlerImpl method run.

/**
 * Runs the DB schema migration step for this service.
 *
 * Flow as visible in this method:
 * - Publish the MIGRATION_INIT state and wait for all nodes of this service to reach it.
 * - On a standby site no migration runs; the current schema version is simply moved to the
 *   target version if it differs.
 * - For geodbsvc the geo schema is checked, then the method waits for migration to be done
 *   elsewhere and updates the vdc version if the schema version changed.
 * - For dbsvc the method takes the migration lock and, up to MAX_MIGRATION_RETRY retryable
 *   failures, persists/compares schemas, runs the migration callbacks and marks the
 *   migration DONE.
 *
 * @return true when migration completed, was already done, or does not apply (standby,
 *         geodbsvc); false when migration failed or was already marked FAILED
 * @throws DatabaseException declared by the signature; presumably raised by the helpers called here
 */
@Override
public boolean run() throws DatabaseException {
    // Start time is handed to markMigrationFailure(...) when a failure is recorded.
    Date startTime = new Date();
    // set state to migration_init and wait for all nodes to reach this state
    setDbConfig(DbConfigConstants.MIGRATION_INIT);
    targetVersion = service.getVersion();
    statusChecker.setVersion(targetVersion);
    statusChecker.setServiceName(service.getName());
    // dbsvc will wait for all dbsvc, and geodbsvc waits for all geodbsvc.
    statusChecker.waitForAllNodesMigrationInit();
    if (schemaUtil.isStandby()) {
        String currentSchemaVersion = coordinator.getCurrentDbSchemaVersion();
        if (!StringUtils.equals(currentSchemaVersion, targetVersion)) {
            // no migration on standby site
            log.info("Migration does not run on standby. Change current version to {}", targetVersion);
            schemaUtil.setCurrentVersion(targetVersion);
        }
        return true;
    }
    if (schemaUtil.isGeoDbsvc()) {
        // Captured before checkGeoDbSchema() and used after the wait to decide on the vdc version update.
        boolean schemaVersionChanged = isDbSchemaVersionChanged();
        // scan and update cassandra schema
        checkGeoDbSchema();
        // no migration procedure for geosvc, just wait till migration is done on one of the
        // dbsvcs
        log.warn("Migration is not supported for Geodbsvc. Wait till migration is done");
        statusChecker.waitForMigrationDone();
        // Update vdc version
        if (schemaVersionChanged) {
            schemaUtil.insertOrUpdateVdcVersion(dbClient, true);
        }
        return true;
    } else {
        // for dbsvc, we have to wait till all geodbsvc becomes migration_init since we might
        // need to copy geo-replicated resources from local to geo db.
        statusChecker.waitForAllNodesMigrationInit(Constants.GEODBSVC_NAME);
    }
    InterProcessLock lock = null;
    String currentSchemaVersion = null;
    int retryCount = 0;
    // Each iteration takes the lock, attempts the migration, and releases the lock in finally.
    while (retryCount < MAX_MIGRATION_RETRY) {
        log.debug("Migration handlers - Start. Trying to grab lock ...");
        try {
            // grab global lock for migration
            // NOTE(review): getLock(...) presumably returns an already-acquired lock - confirm against the helper.
            lock = getLock(DB_MIGRATION_LOCK);
            // make sure we haven't finished the migration on another node already
            MigrationStatus status = coordinator.getMigrationStatus();
            if (status != null) {
                if (status == MigrationStatus.DONE) {
                    log.info("DB migration is done already. Skipping...");
                    // Make sure a schema is persisted for the target version even when another node migrated.
                    if (null == getPersistedSchema(targetVersion)) {
                        persistSchema(targetVersion, DbSchemaChecker.marshalSchemas(currentSchema, null));
                    }
                    return true;
                } else if (status == MigrationStatus.FAILED) {
                    log.error("DB migration is done already with status:{}. ", status);
                    return false;
                }
            }
            schemaUtil.setMigrationStatus(MigrationStatus.RUNNING);
            // we expect currentSchemaVersion to be set
            currentSchemaVersion = coordinator.getCurrentDbSchemaVersion();
            if (currentSchemaVersion == null) {
                throw new IllegalStateException("Schema version not set");
            }
            // figure out our source and target versions
            DbSchemas persistedSchema = getPersistedSchema(currentSchemaVersion);
            if (isSchemaMissed(persistedSchema, currentSchemaVersion, targetVersion)) {
                throw new IllegalStateException("Schema definition not found for version " + currentSchemaVersion);
            }
            if (isFreshInstall(persistedSchema, currentSchemaVersion, targetVersion)) {
                // Fresh install: the in-memory schema becomes the persisted baseline.
                log.info("saving schema of version {} to db", currentSchemaVersion);
                persistedSchema = currentSchema;
                persistSchema(currentSchemaVersion, DbSchemaChecker.marshalSchemas(persistedSchema, null));
            }
            // check if we have a schema upgrade to deal with
            if (!currentSchemaVersion.equals(targetVersion)) {
                log.info("Start scanning and creating new column families");
                schemaUtil.checkCf(true);
                log.info("Scanning and creating new column families succeed");
                DbSchemasDiff diff = new DbSchemasDiff(persistedSchema, currentSchema, ignoredPkgs);
                if (diff.isChanged()) {
                    // log the changes
                    dumpChanges(diff);
                    if (!diff.isUpgradable()) {
                        // we should never be here, but, if we are here, throw an IllegalStateException and stop
                        // To Do - dump the problematic diffs here
                        log.error("schema diff details: {}", DbSchemaChecker.marshalSchemasDiff(diff));
                        throw new IllegalStateException("schema not upgradable.");
                    }
                }
                log.info("Starting migration callbacks from {} to {}", currentSchemaVersion, targetVersion);
                // we need to check point the progress of these callbacks as they are run,
                // so we can resume from where we left off in case of restarts/errors
                String checkpoint = schemaUtil.getMigrationCheckpoint();
                if (checkpoint != null) {
                    log.info("Migration checkpoint found for {}", checkpoint);
                }
                // run all migration callbacks
                runMigrationCallbacks(diff, checkpoint);
                log.info("Done migration callbacks");
                persistSchema(targetVersion, DbSchemaChecker.marshalSchemas(currentSchema, null));
                schemaUtil.dropUnusedCfsIfExists();
                // set current version in zk
                schemaUtil.setCurrentVersion(targetVersion);
                log.info("current schema version is updated to {}", targetVersion);
            }
            schemaUtil.setMigrationStatus(MigrationStatus.DONE);
            // Remove migration checkpoint after done
            schemaUtil.removeMigrationCheckpoint();
            removeMigrationFailInfoIfExist();
            log.debug("Migration handler - Done.");
            return true;
        } catch (Exception e) {
            if (e instanceof MigrationCallbackException) {
                // NOTE(review): this branch neither returns nor increments retryCount, so the loop
                // runs again after sleepBeforeRetry(). Presumably markMigrationFailure(...) sets the
                // status to FAILED so the next iteration returns false - confirm.
                markMigrationFailure(startTime, currentSchemaVersion, e);
            } else if (isUnRetryableException(e)) {
                markMigrationFailure(startTime, currentSchemaVersion, e);
                return false;
            } else {
                log.warn("Retryable exception during migration ", e);
                retryCount++;
                // lastException is not declared in this method (presumably a field); it is
                // reported once the retries are exhausted.
                lastException = e;
            }
        } finally {
            // Runs on every exit path of the try, including the early returns above.
            if (lock != null) {
                try {
                    lock.release();
                } catch (Exception ignore) {
                    // Best effort: a failed release is only logged at debug level (the exception itself is not logged).
                    log.debug("lock release failed");
                }
            }
        }
        sleepBeforeRetry();
    }
    // while -- not done
    // Retries exhausted: record the failure with the last retryable exception.
    markMigrationFailure(startTime, currentSchemaVersion, lastException);
    return false;
}
Also used : MigrationCallbackException(com.emc.storageos.svcs.errorhandling.resources.MigrationCallbackException) DbSchemasDiff(com.emc.storageos.db.common.diff.DbSchemasDiff) InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) DbSchemas(com.emc.storageos.db.common.schema.DbSchemas) MigrationStatus(com.emc.storageos.coordinator.client.model.MigrationStatus) MigrationCallbackException(com.emc.storageos.svcs.errorhandling.resources.MigrationCallbackException) DatabaseException(com.emc.storageos.db.exceptions.DatabaseException) FatalCoordinatorException(com.emc.storageos.coordinator.exceptions.FatalCoordinatorException) FatalDatabaseException(com.emc.storageos.db.exceptions.FatalDatabaseException)

Example 23 with InterProcessLock

use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.

the class MigrationHandlerImpl method checkGeoDbSchema.

/**
 * Scans the geodb schema and creates new column families under the geodb schema lock.
 *
 * The scan is skipped when the schema version changed but not all other vdcs are
 * compatible with the target version yet. Errors during the scan are logged as warnings
 * and not propagated.
 */
private void checkGeoDbSchema() {
    String version = service.getVersion();
    if (isDbSchemaVersionChanged()) {
        // Only consult the other vdcs when this node actually carries a new schema version.
        if (!VdcUtil.checkGeoCompatibleOfOtherVdcs(version)) {
            log.info("Not all vdc are upgraded. Skip geodb schema change until all vdc are upgraded");
            return;
        }
    }

    log.info("Start scanning and creating new column families");
    InterProcessLock schemaLock = null;
    try {
        // grab global lock for migration
        schemaLock = getLock(DbConfigConstants.GEODB_SCHEMA_LOCK);
        schemaUtil.checkCf();
        log.info("Scanning and creating new column families succeed");
    } catch (Exception scanFailure) {
        log.warn("Unexpected error when scan db schema", scanFailure);
    } finally {
        if (schemaLock != null) {
            try {
                schemaLock.release();
            } catch (Exception ignored) {
                // Best effort: a failed release is only noted at debug level.
                log.debug("lock release failed");
            }
        }
    }
}
Also used : InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) MigrationCallbackException(com.emc.storageos.svcs.errorhandling.resources.MigrationCallbackException) DatabaseException(com.emc.storageos.db.exceptions.DatabaseException) FatalCoordinatorException(com.emc.storageos.coordinator.exceptions.FatalCoordinatorException) FatalDatabaseException(com.emc.storageos.db.exceptions.FatalDatabaseException)

Example 24 with InterProcessLock

use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.

the class SchemaUtil method checkAndSetupBootStrapInfo.

/**
 * Init the bootstrap info, including:
 * check and setup root tenant or my vdc info, if it doesn't exist.
 *
 * Does nothing on a standby site or when more than one vdc is known. Otherwise the
 * initialization runs under the bootstrap lock and is retried every
 * DBINIT_RETRY_INTERVAL seconds until it succeeds.
 *
 * @param dbClient db client handed to the insert helpers
 * @throws IllegalStateException rethrown unchanged if raised while initializing; every
 *         other exception triggers a retry
 */
public void checkAndSetupBootStrapInfo(DbClient dbClient) {
    // Standby site need not do the bootstrap
    if (onStandby) {
        _log.info("Skip boot strap info initialization on standby site");
        return;
    }
    // Only the first VDC need check root tenant
    if (_vdcList != null && _vdcList.size() > 1) {
        _log.info("Skip root tenant check for more than one vdcs. Current number of vdcs: {}", _vdcList.size());
        return;
    }
    int retryIntervalSecs = DBINIT_RETRY_INTERVAL;
    boolean done = false;
    while (!done) {
        boolean wait = false;
        InterProcessLock lock = null;
        // Tracks whether acquire() succeeded so that we never release a lock we do not hold.
        boolean lockHeld = false;
        try {
            lock = _coordinator.getLock(getBootstrapLockName());
            _log.info("bootstrap info check - waiting for bootstrap lock");
            lock.acquire();
            lockHeld = true;
            if (isGeoDbsvc()) {
                // insert root tenant if not exist for geodb
                insertDefaultRootTenant(dbClient);
            } else {
                // insert default vdc info if not exist for local db
                insertMyVdcInfo(dbClient);
                // insert VdcVersion if not exist for geo db, don't insert in geo db to avoid race condition.
                insertVdcVersion(dbClient);
                // insert local user's password history if not exist for local db
                insertPasswordHistory(dbClient);
            }
            done = true;
        } catch (IllegalStateException e) {
            // Not retryable: propagate to the caller as-is.
            throw e;
        } catch (Exception e) {
            _log.warn("Exception while checking for bootstrap info, will retry in {} secs", retryIntervalSecs, e);
            wait = true;
        } finally {
            if (lockHeld) {
                try {
                    lock.release();
                } catch (Exception e) {
                    _log.error("Fail to release lock", e);
                }
            }
        }
        if (wait) {
            try {
                // Long arithmetic avoids int overflow for large retry intervals.
                Thread.sleep(retryIntervalSecs * 1000L);
            } catch (InterruptedException ex) {
                // The retry loop deliberately keeps going after an interrupt (unchanged behavior).
                _log.warn("Thread is interrupted during wait for retry", ex);
            }
        }
    }
}
Also used : InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) AlternateIdConstraint(com.emc.storageos.db.client.constraint.AlternateIdConstraint) ContainmentConstraint(com.emc.storageos.db.client.constraint.ContainmentConstraint) DatabaseException(com.emc.storageos.db.exceptions.DatabaseException) OperationException(com.netflix.astyanax.connectionpool.exceptions.OperationException) ConnectionException(com.netflix.astyanax.connectionpool.exceptions.ConnectionException) UnknownHostException(java.net.UnknownHostException)

Example 25 with InterProcessLock

use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.

the class CoordinatorClientImpl method checkAndCreateSiteSpecificSection.

/**
 * Check and initialize site specific section for current site. If site specific section is empty,
 * we always assume current site is active site.
 *
 * The check is done once without the lock and repeated after acquiring the SITES lock, so
 * the section is only created if it is still missing once the lock is held.
 *
 * @throws Exception if acquiring the lock or creating the section fails; the failure is
 *         logged and rethrown unchanged
 */
private void checkAndCreateSiteSpecificSection() throws Exception {
    if (isSiteSpecificSectionInited()) {
        log.info("Site specific section for {} initialized", getSiteId());
        return;
    }
    log.info("The site specific section has NOT been initialized");
    InterProcessLock lock = getLock(ZkPath.SITES.name());
    // Tracks whether acquire() succeeded so that we never release a lock we do not hold.
    boolean lockHeld = false;
    try {
        lock.acquire();
        lockHeld = true;
        // Re-check under the lock before creating the section.
        if (!isSiteSpecificSectionInited()) {
            createSiteSpecificSection();
        }
    } catch (Exception e) {
        log.error("Failed to initialize site specific area for {}.", ZkPath.SITES, e);
        throw e;
    } finally {
        if (lockHeld) {
            try {
                lock.release();
            } catch (Exception e) {
                // The exception is passed as the trailing throwable so its stack trace is logged;
                // the message no longer carries a placeholder that would stay unfilled.
                log.error("Failed to release the lock for {}.", ZkPath.SITES, e);
            }
        }
    }
}
Also used : InterProcessLock(org.apache.curator.framework.recipes.locks.InterProcessLock) CoordinatorException(com.emc.storageos.coordinator.exceptions.CoordinatorException) RetryableCoordinatorException(com.emc.storageos.coordinator.exceptions.RetryableCoordinatorException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException)

Aggregations

InterProcessLock (org.apache.curator.framework.recipes.locks.InterProcessLock)98 APIException (com.emc.storageos.svcs.errorhandling.resources.APIException)25 DatabaseException (com.emc.storageos.db.exceptions.DatabaseException)21 DeviceControllerException (com.emc.storageos.exceptions.DeviceControllerException)15 IOException (java.io.IOException)15 ControllerException (com.emc.storageos.volumecontroller.ControllerException)14 Configuration (com.emc.storageos.coordinator.common.Configuration)12 CoordinatorException (com.emc.storageos.coordinator.exceptions.CoordinatorException)12 UnknownHostException (java.net.UnknownHostException)12 Site (com.emc.storageos.coordinator.client.model.Site)11 RetryableCoordinatorException (com.emc.storageos.coordinator.exceptions.RetryableCoordinatorException)11 NetworkDeviceControllerException (com.emc.storageos.networkcontroller.exceptions.NetworkDeviceControllerException)10 CheckPermission (com.emc.storageos.security.authorization.CheckPermission)9 ServiceError (com.emc.storageos.svcs.errorhandling.model.ServiceError)9 BiosCommandResult (com.emc.storageos.volumecontroller.impl.BiosCommandResult)9 ArrayList (java.util.ArrayList)9 POST (javax.ws.rs.POST)9 NetworkSystem (com.emc.storageos.db.client.model.NetworkSystem)8 Path (javax.ws.rs.Path)8 ConfigurationImpl (com.emc.storageos.coordinator.common.impl.ConfigurationImpl)6