Search in sources :

Example 1 with RecoveryManager

Use of org.apache.accumulo.manager.recovery.RecoveryManager in the Apache Accumulo project.

From the class Manager, method run.

/**
 * Main lifecycle of the manager process: start up, serve, and shut down.
 *
 * <p>The steps, in the order they are executed here:
 * <ol>
 * <li>Start the Thrift client service before the lock is held (see ACCUMULO-4424).</li>
 * <li>Block until the manager lock in ZooKeeper is obtained.</li>
 * <li>Flag the manager as upgrading when the upgrade coordinator is not yet COMPLETE, and
 * initialize metrics (a metrics failure is logged and otherwise ignored).</li>
 * <li>Create the recovery manager, start the status and migration-cleanup threads, begin
 * listening for tablet server changes and call {@code blockForTservers()}.</li>
 * <li>Install a self-renewing ZooKeeper watcher on the recovery node and start one
 * {@code TabletGroupWatcher} per data level (USER, METADATA, ROOT).</li>
 * <li>Wait for the background metadata upgrade, if any, then start FaTE and schedule its
 * age-off task.</li>
 * <li>Optionally start the delegation-token key manager and wait until it is initialized.</li>
 * <li>Publish the client service address as the lock data, schedule the replication check,
 * validate stored user credentials and mark the manager as initialized.</li>
 * <li>Poll until the client service stops serving, then shut down FaTE and join the helper
 * threads, passing {@code remaining(deadline)} as the timeout of every join.</li>
 * </ol>
 *
 * @throws IllegalStateException if the client service cannot be started, the manager lock
 *         cannot be obtained or updated, the recovery node cannot be read, the metadata upgrade
 *         fails, FaTE or the delegation-token key manager cannot be set up, or the thread is
 *         interrupted while waiting during shutdown
 */
@Override
public void run() {
    final ServerContext context = getContext();
    final String zroot = getZooKeeperRoot();
    // ACCUMULO-4424 Put up the Thrift servers before getting the lock as a sign of process health
    // when a hot-standby
    //
    // Start the Manager's Client service
    clientHandler = new ManagerClientServiceHandler(this);
    // Ensure that calls before the manager gets the lock fail
    Iface haProxy = HighlyAvailableServiceWrapper.service(clientHandler, this);
    Iface rpcProxy = TraceUtil.wrapService(haProxy);
    final Processor<Iface> processor;
    if (context.getThriftServerType() == ThriftServerType.SASL) {
        // SASL servers get an additional wrapper around the RPC proxy.
        Iface tcredsProxy = TCredentialsUpdatingWrapper.service(rpcProxy, clientHandler.getClass(), getConfiguration());
        processor = new Processor<>(tcredsProxy);
    } else {
        processor = new Processor<>(rpcProxy);
    }
    ServerAddress sa;
    try {
        sa = TServerUtils.startServer(context, getHostname(), Property.MANAGER_CLIENTPORT, processor, "Manager", "Manager Client Service Handler", null, Property.MANAGER_MINTHREADS, Property.MANAGER_MINTHREADS_TIMEOUT, Property.MANAGER_THREADCHECK, Property.GENERAL_MAX_MESSAGE_SIZE);
    } catch (UnknownHostException e) {
        throw new IllegalStateException("Unable to start server on host " + getHostname(), e);
    }
    clientService = sa.server;
    log.info("Started Manager client service at {}", sa.address);
    // block until we can obtain the ZK lock for the manager
    try {
        getManagerLock(ServiceLock.path(zroot + Constants.ZMANAGER_LOCK));
    } catch (KeeperException | InterruptedException e) {
        throw new IllegalStateException("Exception getting manager lock", e);
    }
    // If the upgrade status is not COMPLETE at this point, flag the manager as upgrading.
    if (upgradeCoordinator.getStatus() != UpgradeCoordinator.UpgradeStatus.COMPLETE) {
        managerUpgrading.set(true);
    }
    try {
        MetricsUtil.initializeMetrics(getContext().getConfiguration(), this.applicationName, sa.getAddress());
        ManagerMetrics.init(getConfiguration(), this);
    } catch (Exception e1) {
        // Deliberately best-effort: the manager keeps starting without metrics.
        log.error("Error initializing metrics, metrics will not be emitted.", e1);
    }
    recoveryManager = new RecoveryManager(this, TIME_TO_CACHE_RECOVERY_WAL_EXISTENCE);
    context.getTableManager().addObserver(this);
    Thread statusThread = Threads.createThread("Status Thread", new StatusThread());
    statusThread.start();
    Threads.createThread("Migration Cleanup Thread", new MigrationCleanupThread()).start();
    tserverSet.startListeningForTabletServerChanges();
    try {
        blockForTservers();
    } catch (InterruptedException ex) {
        // Keep the interrupt visible to later blocking calls and continue start-up.
        Thread.currentThread().interrupt();
    }
    ZooReaderWriter zReaderWriter = context.getZooReaderWriter();
    try {
        zReaderWriter.getChildren(zroot + Constants.ZRECOVERY, new Watcher() {

            @Override
            public void process(WatchedEvent event) {
                nextEvent.event("Noticed recovery changes %s", event.getType());
                try {
                    // watcher only fires once, add it back
                    zReaderWriter.getChildren(zroot + Constants.ZRECOVERY, this);
                } catch (Exception e) {
                    log.error("Failed to add log recovery watcher back", e);
                }
            }
        });
    } catch (KeeperException | InterruptedException e) {
        throw new IllegalStateException("Unable to read " + zroot + Constants.ZRECOVERY, e);
    }
    // One watcher per data level. The METADATA and ROOT watchers are each handed the
    // previously created watcher as their third constructor argument.
    watchers.add(new TabletGroupWatcher(this, TabletStateStore.getStoreForLevel(DataLevel.USER, context, this), null) {

        @Override
        boolean canSuspendTablets() {
            // Always allow user data tablets to enter suspended state.
            return true;
        }
    });
    watchers.add(new TabletGroupWatcher(this, TabletStateStore.getStoreForLevel(DataLevel.METADATA, context, this), watchers.get(0)) {

        @Override
        boolean canSuspendTablets() {
            // Metadata tablets may be suspended only when the MANAGER_METADATA_SUSPENDABLE
            // property is set to true.
            return getConfiguration().getBoolean(Property.MANAGER_METADATA_SUSPENDABLE);
        }
    });
    watchers.add(new TabletGroupWatcher(this, TabletStateStore.getStoreForLevel(DataLevel.ROOT, context), watchers.get(1)) {

        @Override
        boolean canSuspendTablets() {
            // Never allow root tablet to enter suspended state.
            return false;
        }
    });
    for (TabletGroupWatcher watcher : watchers) {
        watcher.start();
    }
    // Once we are sure the upgrade is complete, we can safely allow fate use.
    try {
        // wait for metadata upgrade running in background to complete
        if (null != upgradeMetadataFuture) {
            upgradeMetadataFuture.get();
        }
        // Everything is fully upgraded by this point.
        managerUpgrading.set(false);
    } catch (ExecutionException | InterruptedException e) {
        throw new IllegalStateException("Metadata upgrade failed", e);
    }
    try {
        // FaTE store wrapped in an age-off store configured with 8 hours; ageOff is invoked
        // by the scheduled executor every 63 seconds.
        final AgeOffStore<Manager> store = new AgeOffStore<>(new org.apache.accumulo.fate.ZooStore<>(getZooKeeperRoot() + Constants.ZFATE, context.getZooReaderWriter()), TimeUnit.HOURS.toMillis(8), System::currentTimeMillis);
        fate = new Fate<>(this, store, TraceRepo::toLogString);
        fate.startTransactionRunners(getConfiguration());
        context.getScheduledExecutor().scheduleWithFixedDelay(store::ageOff, 63000, 63000, TimeUnit.MILLISECONDS);
    } catch (KeeperException | InterruptedException e) {
        throw new IllegalStateException("Exception setting up FaTE cleanup thread", e);
    }
    initializeZkForReplication(zReaderWriter, zroot);
    // Make sure that we have a secret key (either a new one or an old one from ZK) before we start
    // the manager client service.
    Thread authenticationTokenKeyManagerThread = null;
    if (authenticationTokenKeyManager != null && keyDistributor != null) {
        log.info("Starting delegation-token key manager");
        try {
            keyDistributor.initialize();
        } catch (KeeperException | InterruptedException e) {
            throw new IllegalStateException("Exception setting up delegation-token key manager", e);
        }
        authenticationTokenKeyManagerThread = Threads.createThread("Delegation Token Key Manager", authenticationTokenKeyManager);
        authenticationTokenKeyManagerThread.start();
        boolean logged = false;
        while (!authenticationTokenKeyManager.isInitialized()) {
            // Print out a status message when we start waiting for the key manager to get initialized
            if (!logged) {
                log.info("Waiting for AuthenticationTokenKeyManager to be initialized");
                logged = true;
            }
            sleepUninterruptibly(200, TimeUnit.MILLISECONDS);
        }
        // And log when we are initialized
        log.info("AuthenticationTokenSecretManager is initialized");
    }
    String address = sa.address.toString();
    log.info("Setting manager lock data to {}", address);
    try {
        // Encode with an explicit charset so the bytes stored in the lock node do not depend on
        // the JVM's platform default. Fully qualified to avoid requiring a new import.
        managerLock.replaceLockData(address.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (KeeperException | InterruptedException e) {
        throw new IllegalStateException("Exception updating manager lock", e);
    }
    // Poll until the Thrift client service reports that it is serving.
    while (!clientService.isServing()) {
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
    }
    // if the replication name is ever set, then start replication services
    final AtomicReference<TServer> replServer = new AtomicReference<>();
    context.getScheduledExecutor().scheduleWithFixedDelay(() -> {
        try {
            @SuppressWarnings("deprecation") Property p = Property.REPLICATION_NAME;
            // Only set up replication while no server has been recorded yet.
            if ((replServer.get() == null) && !getConfiguration().get(p).isEmpty()) {
                log.info("{} was set, starting repl services.", p.getKey());
                replServer.set(setupReplication());
            }
        } catch (UnknownHostException | KeeperException | InterruptedException e) {
            log.error("Error occurred starting replication services. ", e);
        }
    }, 0, 5000, TimeUnit.MILLISECONDS);
    // checking stored user hashes if any of them uses an outdated algorithm
    security.validateStoredUserCreditentials();
    // The manager is fully initialized. Clients are allowed to connect now.
    managerInitialized.set(true);
    // Stay here for as long as the client service is serving; everything below is shutdown.
    while (clientService.isServing()) {
        sleepUninterruptibly(500, TimeUnit.MILLISECONDS);
    }
    log.info("Shutting down fate.");
    fate.shutdown();
    // A single deadline is shared by every join below.
    final long deadline = System.currentTimeMillis() + MAX_CLEANUP_WAIT_TIME;
    try {
        statusThread.join(remaining(deadline));
        if (null != replicationAssignerThread) {
            replicationAssignerThread.join(remaining(deadline));
        }
        if (null != replicationWorkThread) {
            replicationWorkThread.join(remaining(deadline));
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag before converting to an unchecked exception.
        Thread.currentThread().interrupt();
        throw new IllegalStateException("Exception stopping replication workers", e);
    }
    var nullableReplServer = replServer.get();
    if (nullableReplServer != null) {
        nullableReplServer.stop();
    }
    // Signal that we want it to stop, and wait for it to do so.
    if (authenticationTokenKeyManager != null) {
        authenticationTokenKeyManager.gracefulStop();
        try {
            if (null != authenticationTokenKeyManagerThread) {
                authenticationTokenKeyManagerThread.join(remaining(deadline));
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag before converting to an unchecked exception.
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Exception waiting on delegation-token key manager", e);
        }
    }
    // Join each tablet group watcher, passing remaining(deadline) as the join timeout.
    for (TabletGroupWatcher watcher : watchers) {
        try {
            watcher.join(remaining(deadline));
        } catch (InterruptedException e) {
            // Restore the interrupt flag before converting to an unchecked exception.
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Exception waiting on watcher", e);
        }
    }
    log.info("exiting");
}
Also used : TServer(org.apache.thrift.server.TServer) ServerAddress(org.apache.accumulo.server.rpc.ServerAddress) Watcher(org.apache.zookeeper.Watcher) VolumeManager(org.apache.accumulo.server.fs.VolumeManager) AuthenticationTokenSecretManager(org.apache.accumulo.server.security.delegation.AuthenticationTokenSecretManager) RecoveryManager(org.apache.accumulo.manager.recovery.RecoveryManager) AuthenticationTokenKeyManager(org.apache.accumulo.server.security.delegation.AuthenticationTokenKeyManager) TableManager(org.apache.accumulo.server.tables.TableManager) WatchedEvent(org.apache.zookeeper.WatchedEvent) Iface(org.apache.accumulo.core.manager.thrift.ManagerClientService.Iface) ExecutionException(java.util.concurrent.ExecutionException) AgeOffStore(org.apache.accumulo.fate.AgeOffStore) Property(org.apache.accumulo.core.conf.Property) UnknownHostException(java.net.UnknownHostException) ZooReaderWriter(org.apache.accumulo.fate.zookeeper.ZooReaderWriter) AtomicReference(java.util.concurrent.atomic.AtomicReference) TableNotFoundException(org.apache.accumulo.core.client.TableNotFoundException) NoAuthException(org.apache.zookeeper.KeeperException.NoAuthException) TException(org.apache.thrift.TException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) ExecutionException(java.util.concurrent.ExecutionException) TTransportException(org.apache.thrift.transport.TTransportException) KeeperException(org.apache.zookeeper.KeeperException) ThriftTableOperationException(org.apache.accumulo.core.clientImpl.thrift.ThriftTableOperationException) RecoveryManager(org.apache.accumulo.manager.recovery.RecoveryManager) ServerContext(org.apache.accumulo.server.ServerContext) KeeperException(org.apache.zookeeper.KeeperException)

Aggregations

IOException (java.io.IOException)1 UnknownHostException (java.net.UnknownHostException)1 ExecutionException (java.util.concurrent.ExecutionException)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException)1 ThriftTableOperationException (org.apache.accumulo.core.clientImpl.thrift.ThriftTableOperationException)1 Property (org.apache.accumulo.core.conf.Property)1 Iface (org.apache.accumulo.core.manager.thrift.ManagerClientService.Iface)1 AgeOffStore (org.apache.accumulo.fate.AgeOffStore)1 ZooReaderWriter (org.apache.accumulo.fate.zookeeper.ZooReaderWriter)1 RecoveryManager (org.apache.accumulo.manager.recovery.RecoveryManager)1 ServerContext (org.apache.accumulo.server.ServerContext)1 VolumeManager (org.apache.accumulo.server.fs.VolumeManager)1 ServerAddress (org.apache.accumulo.server.rpc.ServerAddress)1 AuthenticationTokenKeyManager (org.apache.accumulo.server.security.delegation.AuthenticationTokenKeyManager)1 AuthenticationTokenSecretManager (org.apache.accumulo.server.security.delegation.AuthenticationTokenSecretManager)1 TableManager (org.apache.accumulo.server.tables.TableManager)1 TException (org.apache.thrift.TException)1 TServer (org.apache.thrift.server.TServer)1 TTransportException (org.apache.thrift.transport.TTransportException)1