Search in sources:

Example 1 with Metrics

use of org.apache.accumulo.server.metrics.Metrics in project accumulo by apache.

the class Master method run.

public void run() throws IOException, InterruptedException, KeeperException {
    final String zroot = ZooUtil.getRoot(getInstance());
    // ACCUMULO-4424: put up the Thrift servers before getting the lock, as a sign of process health for a hot-standby master
    // Start the Master's Client service
    clientHandler = new MasterClientServiceHandler(this);
    // Ensure that calls before the master gets the lock fail
    Iface haProxy = HighlyAvailableServiceWrapper.service(clientHandler, this);
    Iface rpcProxy = RpcWrapper.service(haProxy);
    final Processor<Iface> processor;
    if (ThriftServerType.SASL == getThriftServerType()) {
        Iface tcredsProxy = TCredentialsUpdatingWrapper.service(rpcProxy, clientHandler.getClass(), getConfiguration());
        processor = new Processor<>(tcredsProxy);
    } else {
        processor = new Processor<>(rpcProxy);
    }
    ServerAddress sa = TServerUtils.startServer(this, hostname, Property.MASTER_CLIENTPORT, processor, "Master", "Master Client Service Handler", null, Property.MASTER_MINTHREADS, Property.MASTER_THREADCHECK, Property.GENERAL_MAX_MESSAGE_SIZE);
    clientService = sa.server;
    log.info("Started Master client service at {}", sa.address);
    // Start the replication coordinator which assigns tservers to service replication requests
    MasterReplicationCoordinator impl = new MasterReplicationCoordinator(this);
    ReplicationCoordinator.Iface haReplicationProxy = HighlyAvailableServiceWrapper.service(impl, this);
    ReplicationCoordinator.Processor<ReplicationCoordinator.Iface> replicationCoordinatorProcessor = new ReplicationCoordinator.Processor<>(RpcWrapper.service(haReplicationProxy));
    ServerAddress replAddress = TServerUtils.startServer(this, hostname, Property.MASTER_REPLICATION_COORDINATOR_PORT, replicationCoordinatorProcessor, "Master Replication Coordinator", "Replication Coordinator", null, Property.MASTER_REPLICATION_COORDINATOR_MINTHREADS, Property.MASTER_REPLICATION_COORDINATOR_THREADCHECK, Property.GENERAL_MAX_MESSAGE_SIZE);
    log.info("Started replication coordinator service at " + replAddress.address);
    // block until we can obtain the ZK lock for the master
    getMasterLock(zroot + Constants.ZMASTER_LOCK);
    recoveryManager = new RecoveryManager(this);
    TableManager.getInstance().addObserver(this);
    StatusThread statusThread = new StatusThread();
    statusThread.start();
    MigrationCleanupThread migrationCleanupThread = new MigrationCleanupThread();
    migrationCleanupThread.start();
    tserverSet.startListeningForTabletServerChanges();
    ZooReaderWriter zReaderWriter = ZooReaderWriter.getInstance();
    zReaderWriter.getChildren(zroot + Constants.ZRECOVERY, new Watcher() {

        @Override
        public void process(WatchedEvent event) {
            nextEvent.event("Noticed recovery changes", event.getType());
            try {
                // watcher only fires once, add it back
                ZooReaderWriter.getInstance().getChildren(zroot + Constants.ZRECOVERY, this);
            } catch (Exception e) {
                log.error("Failed to add log recovery watcher back", e);
            }
        }
    });
    watchers.add(new TabletGroupWatcher(this, new MetaDataStateStore(this, this), null) {

        @Override
        boolean canSuspendTablets() {
            // Always allow user data tablets to enter suspended state.
            return true;
        }
    });
    watchers.add(new TabletGroupWatcher(this, new RootTabletStateStore(this, this), watchers.get(0)) {

        @Override
        boolean canSuspendTablets() {
            // Allow metadata tablets to be suspended only when explicitly configured; otherwise they should
            // be immediately reassigned, even if there's a global table.suspension.duration setting.
            return getConfiguration().getBoolean(Property.MASTER_METADATA_SUSPENDABLE);
        }
    });
    watchers.add(new TabletGroupWatcher(this, new ZooTabletStateStore(new ZooStore(zroot)), watchers.get(1)) {

        @Override
        boolean canSuspendTablets() {
            // Never allow root tablet to enter suspended state.
            return false;
        }
    });
    for (TabletGroupWatcher watcher : watchers) {
        watcher.start();
    }
    // Once we are sure the upgrade is complete, we can safely allow fate use.
    waitForMetadataUpgrade.await();
    try {
        final AgeOffStore<Master> store = new AgeOffStore<>(new org.apache.accumulo.fate.ZooStore<Master>(ZooUtil.getRoot(getInstance()) + Constants.ZFATE, ZooReaderWriter.getInstance()), 1000 * 60 * 60 * 8);
        int threads = getConfiguration().getCount(Property.MASTER_FATE_THREADPOOL_SIZE);
        fate = new Fate<>(this, store);
        fate.startTransactionRunners(threads);
        SimpleTimer.getInstance(getConfiguration()).schedule(new Runnable() {

            @Override
            public void run() {
                store.ageOff();
            }
        }, 63000, 63000);
    } catch (KeeperException | InterruptedException e) {
        throw new IOException(e);
    }
    ZooKeeperInitialization.ensureZooKeeperInitialized(zReaderWriter, zroot);
    // If delegation tokens are enabled, initialize the key manager before fully advertising
    // the master client service.
    if (null != authenticationTokenKeyManager && null != keyDistributor) {
        log.info("Starting delegation-token key manager");
        keyDistributor.initialize();
        authenticationTokenKeyManager.start();
        boolean logged = false;
        while (!authenticationTokenKeyManager.isInitialized()) {
            // Print out a status message when we start waiting for the key manager to get initialized
            if (!logged) {
                log.info("Waiting for AuthenticationTokenKeyManager to be initialized");
                logged = true;
            }
            sleepUninterruptibly(200, TimeUnit.MILLISECONDS);
        }
        // And log when we are initialized
        log.info("AuthenticationTokenSecretManager is initialized");
    }
    String address = sa.address.toString();
    log.info("Setting master lock data to {}", address);
    masterLock.replaceLockData(address.getBytes());
    while (!clientService.isServing()) {
        sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
    }
    // Start the daemon to scan the replication table and make units of work
    replicationWorkDriver = new ReplicationDriver(this);
    replicationWorkDriver.start();
    // Start the daemon to assign work to tservers to replicate to our peers
    try {
        replicationWorkAssigner = new WorkDriver(this);
    } catch (AccumuloException | AccumuloSecurityException e) {
        log.error("Caught exception trying to initialize replication WorkDriver", e);
        throw new RuntimeException(e);
    }
    replicationWorkAssigner.start();
    // Advertise the port we used so peers don't have to be told what it is
    ZooReaderWriter.getInstance().putPersistentData(ZooUtil.getRoot(getInstance()) + Constants.ZMASTER_REPLICATION_COORDINATOR_ADDR, replAddress.address.toString().getBytes(UTF_8), NodeExistsPolicy.OVERWRITE);
    // Register replication metrics
    MasterMetricsFactory factory = new MasterMetricsFactory(getConfiguration(), this);
    Metrics replicationMetrics = factory.createReplicationMetrics();
    try {
        replicationMetrics.register();
    } catch (Exception e) {
        log.error("Failed to register replication metrics", e);
    }
    // The master is fully initialized. Clients are allowed to connect now.
    masterInitialized.set(true);
    while (clientService.isServing()) {
        sleepUninterruptibly(500, TimeUnit.MILLISECONDS);
    }
    log.info("Shutting down fate.");
    fate.shutdown();
    log.info("Shutting down timekeeping.");
    timeKeeper.shutdown();
    final long deadline = System.currentTimeMillis() + MAX_CLEANUP_WAIT_TIME;
    statusThread.join(remaining(deadline));
    replicationWorkAssigner.join(remaining(deadline));
    replicationWorkDriver.join(remaining(deadline));
    replAddress.server.stop();
    // Signal that we want it to stop, and wait for it to do so.
    if (authenticationTokenKeyManager != null) {
        authenticationTokenKeyManager.gracefulStop();
        authenticationTokenKeyManager.join(remaining(deadline));
    }
    // wait for the tablet group watchers, but only up to the deadline in case they don't stop
    for (TabletGroupWatcher watcher : watchers) {
        watcher.join(remaining(deadline));
    }
    log.info("exiting");
}
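
The metrics portion of Example 1 is the last few lines: a MasterMetricsFactory builds the replication Metrics instance and register() is called, with any failure logged rather than rethrown so a metrics problem cannot abort master startup. A minimal sketch of that flow, assuming only the calls visible above (the wrapper class and logger below are illustrative, not Accumulo code):

import org.apache.accumulo.master.metrics.MasterMetricsFactory;
import org.apache.accumulo.server.metrics.Metrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Illustrative wrapper, not part of Accumulo: shows the create-then-register-quietly pattern.
class ReplicationMetricsStartup {
    private static final Logger log = LoggerFactory.getLogger(ReplicationMetricsStartup.class);

    void start(MasterMetricsFactory factory) {
        Metrics replicationMetrics = factory.createReplicationMetrics();
        try {
            // register() hooks the metrics source into the reporting system; failures are non-fatal.
            replicationMetrics.register();
        } catch (Exception e) {
            log.error("Failed to register replication metrics", e);
        }
    }
}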

Example 2 with Metrics

use of org.apache.accumulo.server.metrics.Metrics in project accumulo by apache.

the class Tablet method minorCompact.

DataFileValue minorCompact(VolumeManager fs, InMemoryMap memTable, FileRef tmpDatafile, FileRef newDatafile, FileRef mergeFile, boolean hasQueueTime, long queued, CommitSession commitSession, long flushId, MinorCompactionReason mincReason) {
    boolean failed = false;
    long start = System.currentTimeMillis();
    timer.incrementStatusMinor();
    long count = 0;
    String oldName = Thread.currentThread().getName();
    try {
        Thread.currentThread().setName("Minor compacting " + this.extent);
        Span span = Trace.start("write");
        CompactionStats stats;
        try {
            count = memTable.getNumEntries();
            DataFileValue dfv = null;
            if (mergeFile != null)
                dfv = getDatafileManager().getDatafileSizes().get(mergeFile);
            MinorCompactor compactor = new MinorCompactor(tabletServer, this, memTable, mergeFile, dfv, tmpDatafile, mincReason, tableConfiguration);
            stats = compactor.call();
        } finally {
            span.stop();
        }
        span = Trace.start("bringOnline");
        try {
            getDatafileManager().bringMinorCompactionOnline(tmpDatafile, newDatafile, mergeFile, new DataFileValue(stats.getFileSize(), stats.getEntriesWritten()), commitSession, flushId);
        } finally {
            span.stop();
        }
        return new DataFileValue(stats.getFileSize(), stats.getEntriesWritten());
    } catch (Exception | Error e) {
        failed = true;
        throw new RuntimeException(e);
    } finally {
        Thread.currentThread().setName(oldName);
        try {
            getTabletMemory().finalizeMinC();
        } catch (Throwable t) {
            log.error("Failed to free tablet memory", t);
        }
        if (!failed) {
            lastMinorCompactionFinishTime = System.currentTimeMillis();
        }
        Metrics minCMetrics = getTabletServer().getMinCMetrics();
        if (minCMetrics.isEnabled())
            minCMetrics.add(TabletServerMinCMetrics.MINC, (lastMinorCompactionFinishTime - start));
        if (hasQueueTime) {
            timer.updateTime(Operation.MINOR, queued, start, count, failed);
            if (minCMetrics.isEnabled())
                minCMetrics.add(TabletServerMinCMetrics.QUEUE, (start - queued));
        } else
            timer.updateTime(Operation.MINOR, start, count, failed);
    }
}
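
The finally block above is guarded: isEnabled() is checked before add() because the metrics subsystem may be switched off by configuration, and add() is fed a duration in milliseconds (the compaction time, plus the queue time when it is known). A small hedged helper capturing that guard-then-record shape, assuming add() takes a metric name String and a long value as the call sites suggest (the class and method names are invented for illustration, not Accumulo APIs):

import org.apache.accumulo.server.metrics.Metrics;

// Illustrative helper, not part of Accumulo: records a value only when metrics are enabled,
// mirroring the isEnabled()/add() guard used in minorCompact above.
final class MetricsSupport {
    private MetricsSupport() {}

    static void addIfEnabled(Metrics metrics, String name, long value) {
        if (metrics.isEnabled()) {
            metrics.add(name, value);
        }
    }
}

With such a helper, the two calls above would read as addIfEnabled(minCMetrics, TabletServerMinCMetrics.MINC, lastMinorCompactionFinishTime - start) and addIfEnabled(minCMetrics, TabletServerMinCMetrics.QUEUE, start - queued).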

Example 3 with Metrics

use of org.apache.accumulo.server.metrics.Metrics in project accumulo by apache.

the class Tablet method nextBatch.

Batch nextBatch(SortedKeyValueIterator<Key, Value> iter, Range range, int num, Set<Column> columns, long batchTimeOut, boolean isolated) throws IOException {
    // log.info("In nextBatch..");
    long stopTime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(batchTimeOut);
    if (batchTimeOut == Long.MAX_VALUE || batchTimeOut <= 0) {
        batchTimeOut = 0;
    }
    List<KVEntry> results = new ArrayList<>();
    Key key = null;
    Value value;
    long resultSize = 0L;
    long resultBytes = 0L;
    long maxResultsSize = tableConfiguration.getAsBytes(Property.TABLE_SCAN_MAXMEM);
    Key continueKey = null;
    boolean skipContinueKey = false;
    YieldCallback<Key> yield = new YieldCallback<>();
    // we cannot yield if we are in isolation mode
    if (!isolated) {
        iter.enableYielding(yield);
    }
    if (columns.size() == 0) {
        iter.seek(range, LocalityGroupUtil.EMPTY_CF_SET, false);
    } else {
        iter.seek(range, LocalityGroupUtil.families(columns), true);
    }
    while (iter.hasTop()) {
        if (yield.hasYielded()) {
            throw new IOException("Coding error: hasTop returned true but has yielded at " + yield.getPositionAndReset());
        }
        value = iter.getTopValue();
        key = iter.getTopKey();
        // copies key and value
        KVEntry kvEntry = new KVEntry(key, value);
        results.add(kvEntry);
        resultSize += kvEntry.estimateMemoryUsed();
        resultBytes += kvEntry.numBytes();
        boolean timesUp = batchTimeOut > 0 && System.nanoTime() >= stopTime;
        if (resultSize >= maxResultsSize || results.size() >= num || timesUp) {
            continueKey = new Key(key);
            skipContinueKey = true;
            break;
        }
        iter.next();
    }
    if (yield.hasYielded()) {
        continueKey = new Key(yield.getPositionAndReset());
        skipContinueKey = true;
        if (!range.contains(continueKey)) {
            throw new IOException("Underlying iterator yielded to a position outside of its range: " + continueKey + " not in " + range);
        }
        if (!results.isEmpty() && continueKey.compareTo(results.get(results.size() - 1).getKey()) <= 0) {
            throw new IOException("Underlying iterator yielded to a position that does not follow the last key returned: " + continueKey + " <= " + results.get(results.size() - 1).getKey());
        }
        log.debug("Scan yield detected at position " + continueKey);
        Metrics scanMetrics = getTabletServer().getScanMetrics();
        if (scanMetrics.isEnabled())
            scanMetrics.add(TabletServerScanMetrics.YIELD, 1);
    } else if (!iter.hasTop()) {
        // end of tablet has been reached
        continueKey = null;
        if (results.size() == 0)
            results = null;
    }
    return new Batch(skipContinueKey, results, continueKey, resultBytes);
}
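
Here the same add() call is used as an event counter rather than a timer: when the underlying iterator yields, the scan metrics are bumped by one (Example 4 below does the same in lookup()). A compact hedged sketch of that counter use, again assuming add() takes a name and a long as the call sites suggest; the wrapper class is illustrative and not part of Accumulo:

import org.apache.accumulo.server.metrics.Metrics;
import org.apache.accumulo.tserver.metrics.TabletServerScanMetrics;

// Illustrative helper: counts one yield event, guarded the same way as the timing metrics.
final class YieldCounter {
    private YieldCounter() {}

    static void recordYield(Metrics scanMetrics) {
        if (scanMetrics.isEnabled()) {
            scanMetrics.add(TabletServerScanMetrics.YIELD, 1);
        }
    }
}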

Example 4 with Metrics

use of org.apache.accumulo.server.metrics.Metrics in project accumulo by apache.

the class Tablet method lookup.

private LookupResult lookup(SortedKeyValueIterator<Key, Value> mmfi, List<Range> ranges, HashSet<Column> columnSet, List<KVEntry> results, long maxResultsSize, long batchTimeOut) throws IOException {
    LookupResult lookupResult = new LookupResult();
    boolean exceededMemoryUsage = false;
    boolean tabletClosed = false;
    Set<ByteSequence> cfset = null;
    if (columnSet.size() > 0)
        cfset = LocalityGroupUtil.families(columnSet);
    long returnTime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(batchTimeOut);
    if (batchTimeOut <= 0 || batchTimeOut == Long.MAX_VALUE) {
        batchTimeOut = 0;
    }
    // determine if the iterator supported yielding
    YieldCallback<Key> yield = new YieldCallback<>();
    mmfi.enableYielding(yield);
    boolean yielded = false;
    for (Range range : ranges) {
        boolean timesUp = batchTimeOut > 0 && System.nanoTime() > returnTime;
        if (exceededMemoryUsage || tabletClosed || timesUp || yielded) {
            lookupResult.unfinishedRanges.add(range);
            continue;
        }
        int entriesAdded = 0;
        try {
            if (cfset != null)
                mmfi.seek(range, cfset, true);
            else
                mmfi.seek(range, LocalityGroupUtil.EMPTY_CF_SET, false);
            while (mmfi.hasTop()) {
                if (yield.hasYielded()) {
                    throw new IOException("Coding error: hasTop returned true but has yielded at " + yield.getPositionAndReset());
                }
                Key key = mmfi.getTopKey();
                KVEntry kve = new KVEntry(key, mmfi.getTopValue());
                results.add(kve);
                entriesAdded++;
                lookupResult.bytesAdded += kve.estimateMemoryUsed();
                lookupResult.dataSize += kve.numBytes();
                exceededMemoryUsage = lookupResult.bytesAdded > maxResultsSize;
                timesUp = batchTimeOut > 0 && System.nanoTime() > returnTime;
                if (exceededMemoryUsage || timesUp) {
                    addUnfinishedRange(lookupResult, range, key, false);
                    break;
                }
                mmfi.next();
            }
            if (yield.hasYielded()) {
                yielded = true;
                Key yieldPosition = yield.getPositionAndReset();
                if (!range.contains(yieldPosition)) {
                    throw new IOException("Underlying iterator yielded to a position outside of its range: " + yieldPosition + " not in " + range);
                }
                if (!results.isEmpty() && yieldPosition.compareTo(results.get(results.size() - 1).getKey()) <= 0) {
                    throw new IOException("Underlying iterator yielded to a position that does not follow the last key returned: " + yieldPosition + " <= " + results.get(results.size() - 1).getKey());
                }
                addUnfinishedRange(lookupResult, range, yieldPosition, false);
                log.debug("Scan yield detected at position " + yieldPosition);
                Metrics scanMetrics = getTabletServer().getScanMetrics();
                if (scanMetrics.isEnabled())
                    scanMetrics.add(TabletServerScanMetrics.YIELD, 1);
            }
        } catch (TooManyFilesException tmfe) {
            // treat this as a closed tablet, and let the client retry
            log.warn("Tablet {} has too many files, batch lookup can not run", getExtent());
            handleTabletClosedDuringScan(results, lookupResult, exceededMemoryUsage, range, entriesAdded);
            tabletClosed = true;
        } catch (IOException ioe) {
            if (shutdownInProgress()) {
                // assume HDFS shutdown hook caused this exception
                log.debug("IOException while shutdown in progress", ioe);
                handleTabletClosedDuringScan(results, lookupResult, exceededMemoryUsage, range, entriesAdded);
                tabletClosed = true;
            } else {
                throw ioe;
            }
        } catch (IterationInterruptedException iie) {
            if (isClosed()) {
                handleTabletClosedDuringScan(results, lookupResult, exceededMemoryUsage, range, entriesAdded);
                tabletClosed = true;
            } else {
                throw iie;
            }
        } catch (TabletClosedException tce) {
            handleTabletClosedDuringScan(results, lookupResult, exceededMemoryUsage, range, entriesAdded);
            tabletClosed = true;
        }
    }
    return lookupResult;
}

Example 5 with Metrics

use of org.apache.accumulo.server.metrics.Metrics in project accumulo by apache.

the class TabletServer method run.

// main loop listens for client requests
@Override
public void run() {
    SecurityUtil.serverLogin(SiteConfiguration.getInstance());
    // Make sure the ZooKeeper paths exist before we try to use them.
    try {
        ZooKeeperInitialization.ensureZooKeeperInitialized(ZooReaderWriter.getInstance(), ZooUtil.getRoot(getInstance()));
    } catch (KeeperException | InterruptedException e) {
        log.error("Could not ensure that ZooKeeper is properly initialized", e);
        throw new RuntimeException(e);
    }
    Metrics tserverMetrics = metricsFactory.createTabletServerMetrics(this);
    // Register MBeans
    try {
        tserverMetrics.register();
        mincMetrics.register();
        scanMetrics.register();
        updateMetrics.register();
    } catch (Exception e) {
        log.error("Error registering with JMX", e);
    }
    if (null != authKeyWatcher) {
        log.info("Seeding ZooKeeper watcher for authentication keys");
        try {
            authKeyWatcher.updateAuthKeys();
        } catch (KeeperException | InterruptedException e) {
            // TODO Does there need to be a better check? What are the error conditions that we'd fall out here? AUTH_FAILURE?
            // If we get the error, do we just put it on a timer and retry the exists(String, Watcher) call?
            log.error("Failed to perform initial check for authentication tokens in ZooKeeper. Delegation token authentication will be unavailable.", e);
        }
    }
    try {
        clientAddress = startTabletClientService();
    } catch (UnknownHostException e1) {
        throw new RuntimeException("Failed to start the tablet client service", e1);
    }
    announceExistence();
    try {
        walMarker.initWalMarker(getTabletSession());
    } catch (Exception e) {
        log.error("Unable to create WAL marker node in zookeeper", e);
        throw new RuntimeException(e);
    }
    ThreadPoolExecutor distWorkQThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.TSERV_WORKQ_THREADS), "distributed work queue");
    bulkFailedCopyQ = new DistributedWorkQueue(ZooUtil.getRoot(getInstance()) + Constants.ZBULK_FAILED_COPYQ, getConfiguration());
    try {
        bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(), distWorkQThreadPool);
    } catch (Exception e1) {
        throw new RuntimeException("Failed to start distributed work queue for copying ", e1);
    }
    try {
        logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool);
    } catch (Exception ex) {
        log.error("Error setting watches for recoveries");
        throw new RuntimeException(ex);
    }
    // Start the thrift service listening for incoming replication requests
    try {
        replicationAddress = startReplicationService();
    } catch (UnknownHostException e) {
        throw new RuntimeException("Failed to start replication service", e);
    }
    // Start the pool to handle outgoing replications
    final ThreadPoolExecutor replicationThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.REPLICATION_WORKER_THREADS), "replication task");
    replWorker.setExecutor(replicationThreadPool);
    replWorker.run();
    // Periodically check the configured replication worker pool size and resize the pool if it has changed (the timer below runs this every 30 seconds)
    final AccumuloConfiguration aconf = getConfiguration();
    Runnable replicationWorkThreadPoolResizer = new Runnable() {

        @Override
        public void run() {
            int maxPoolSize = aconf.getCount(Property.REPLICATION_WORKER_THREADS);
            if (replicationThreadPool.getMaximumPoolSize() != maxPoolSize) {
                log.info("Resizing thread pool for sending replication work from {} to {}", replicationThreadPool.getMaximumPoolSize(), maxPoolSize);
                replicationThreadPool.setMaximumPoolSize(maxPoolSize);
            }
        }
    };
    SimpleTimer.getInstance(aconf).schedule(replicationWorkThreadPoolResizer, 10000, 30000);
    final long CLEANUP_BULK_LOADED_CACHE_MILLIS = 15 * 60 * 1000;
    SimpleTimer.getInstance(aconf).schedule(new BulkImportCacheCleaner(this), CLEANUP_BULK_LOADED_CACHE_MILLIS, CLEANUP_BULK_LOADED_CACHE_MILLIS);
    HostAndPort masterHost;
    while (!serverStopRequested) {
        // send all of the pending messages
        try {
            MasterMessage mm = null;
            MasterClientService.Client iface = null;
            try {
                // wait until there is a message to send, or a server stop was requested
                while (mm == null && !serverStopRequested) {
                    mm = masterMessages.poll(1000, TimeUnit.MILLISECONDS);
                }
                // we have a message to send to the master, so grab a connection
                masterHost = getMasterAddress();
                iface = masterConnection(masterHost);
                TServiceClient client = iface;
                // if the send loop below never runs, the finally block will place mm back on the queue
                while (!serverStopRequested && mm != null && client != null && client.getOutputProtocol() != null && client.getOutputProtocol().getTransport() != null && client.getOutputProtocol().getTransport().isOpen()) {
                    try {
                        mm.send(rpcCreds(), getClientAddressString(), iface);
                        mm = null;
                    } catch (TException ex) {
                        log.warn("Error sending message: queuing message again");
                        masterMessages.putFirst(mm);
                        mm = null;
                        throw ex;
                    }
                    // if any messages are immediately available, grab them and send them
                    mm = masterMessages.poll();
                }
            } finally {
                if (mm != null) {
                    masterMessages.putFirst(mm);
                }
                returnMasterConnection(iface);
                sleepUninterruptibly(1, TimeUnit.SECONDS);
            }
        } catch (InterruptedException e) {
            log.info("Interrupt Exception received, shutting down");
            serverStopRequested = true;
        } catch (Exception e) {
            // may have lost connection with master
            // loop back to the beginning and wait for a new one
            // this way we survive master failures
            log.error(getClientAddressString() + ": TServerInfo: Exception. Master down?", e);
        }
    }
    // block until shutdown is complete so that resources do not get prematurely finalized
    synchronized (this) {
        while (!shutdownComplete) {
            try {
                this.wait(1000);
            } catch (InterruptedException e) {
                log.error(e.toString());
            }
        }
    }
    log.debug("Stopping Replication Server");
    TServerUtils.stopTServer(this.replServer);
    log.debug("Stopping Thrift Servers");
    TServerUtils.stopTServer(server);
    try {
        log.debug("Closing filesystem");
        fs.close();
    } catch (IOException e) {
        log.warn("Failed to close filesystem : {}", e.getMessage(), e);
    }
    gcLogger.logGCInfo(getConfiguration());
    log.info("TServerInfo: stop requested. exiting ... ");
    try {
        tabletServerLock.unlock();
    } catch (Exception e) {
        log.warn("Failed to release tablet server lock", e);
    }
}
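
Example 5 registers four separate Metrics sources (general tserver, minor compaction, scan, and update metrics) inside one try block, so the first failure is logged once and any remaining registrations are skipped. A hedged alternative shape, not what the Accumulo code does, that registers each source independently so one failure does not prevent the others from registering (the helper class is illustrative):

import java.util.List;

import org.apache.accumulo.server.metrics.Metrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Illustrative helper, not part of Accumulo: register every metrics source, logging failures individually.
final class MetricsRegistrar {
    private static final Logger log = LoggerFactory.getLogger(MetricsRegistrar.class);

    private MetricsRegistrar() {}

    static void registerAll(List<Metrics> sources) {
        for (Metrics metrics : sources) {
            try {
                metrics.register();
            } catch (Exception e) {
                log.error("Error registering metrics source with JMX", e);
            }
        }
    }
}

Called as MetricsRegistrar.registerAll(Arrays.asList(tserverMetrics, mincMetrics, scanMetrics, updateMetrics)), this keeps the log-and-continue behaviour while letting the remaining sources still register.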

Aggregations

IOException (java.io.IOException): 5
Metrics (org.apache.accumulo.server.metrics.Metrics): 5
TabletServerScanMetrics (org.apache.accumulo.tserver.metrics.TabletServerScanMetrics): 4
IterationInterruptedException (org.apache.accumulo.core.iterators.IterationInterruptedException): 3
TabletServerMinCMetrics (org.apache.accumulo.tserver.metrics.TabletServerMinCMetrics): 3
KeeperException (org.apache.zookeeper.KeeperException): 3
AccumuloException (org.apache.accumulo.core.client.AccumuloException): 2
AccumuloSecurityException (org.apache.accumulo.core.client.AccumuloSecurityException): 2
TableNotFoundException (org.apache.accumulo.core.client.TableNotFoundException): 2
ThriftTableOperationException (org.apache.accumulo.core.client.impl.thrift.ThriftTableOperationException): 2
Key (org.apache.accumulo.core.data.Key): 2
YieldCallback (org.apache.accumulo.core.iterators.YieldCallback): 2
DataFileValue (org.apache.accumulo.core.metadata.schema.DataFileValue): 2
WalMarkerException (org.apache.accumulo.server.log.WalStateManager.WalMarkerException): 2
TooManyFilesException (org.apache.accumulo.tserver.TooManyFilesException): 2
NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException): 2
FileNotFoundException (java.io.FileNotFoundException): 1
UnknownHostException (java.net.UnknownHostException): 1
ArrayList (java.util.ArrayList): 1
CancellationException (java.util.concurrent.CancellationException): 1