Use of org.apache.accumulo.tserver.metrics.TabletServerMetrics in project accumulo by apache.
Class TabletServer, method run.
/**
 * Main loop of the tablet server.
 *
 * Runs in three phases:
 *
 * 1. Start-up: performs the server login, prepares ZooKeeper for replication, seeds the
 * authentication key watcher (when one is present), starts the tablet client service, announces
 * the server, sets up metrics, the compaction manager, the WAL marker, the bulk-failed-copy work
 * queue and the recovery log watcher, and schedules the periodic background tasks.
 *
 * 2. Message loop: until {@code serverStopRequested} is set, takes messages from
 * {@code managerMessages} and sends them to the manager. A message that could not be sent is put
 * back at the head of the queue.
 *
 * 3. Shutdown: waits for {@code shutdownComplete}, then stops the replication and Thrift servers,
 * closes the volume manager, logs GC info and releases the tablet server lock.
 *
 * @throws RuntimeException if the tablet client service, the WAL marker, the bulk-failed-copy
 *         work queue or the recovery log watcher cannot be started
 */
@Override
public void run() {
  SecurityUtil.serverLogin(getConfiguration());

  // To make things easier on users/devs, and to avoid creating an upgrade path to 1.7
  // We can just make the zookeeper paths before we try to use.
  initializeZkForReplication();

  if (authKeyWatcher != null) {
    log.info("Seeding ZooKeeper watcher for authentication keys");
    try {
      authKeyWatcher.updateAuthKeys();
    } catch (KeeperException | InterruptedException e) {
      // TODO Does there need to be a better check? What are the error conditions that we'd fall
      // out here? AUTH_FAILURE?
      // If we get the error, do we just put it on a timer and retry the exists(String, Watcher)
      // call?
      log.error("Failed to perform initial check for authentication tokens in"
          + " ZooKeeper. Delegation token authentication will be unavailable.", e);
    }
  }

  try {
    clientAddress = startTabletClientService();
  } catch (UnknownHostException e1) {
    throw new RuntimeException("Failed to start the tablet client service", e1);
  }
  announceExistence();

  // A metrics initialization failure is not fatal: the server keeps running without metrics.
  try {
    MetricsUtil.initializeMetrics(context.getConfiguration(), this.applicationName,
        clientAddress);
  } catch (Exception e1) {
    log.error("Error initializing metrics, metrics will not be emitted.", e1);
  }
  metrics = new TabletServerMetrics(this);
  updateMetrics = new TabletServerUpdateMetrics();
  scanMetrics = new TabletServerScanMetrics();
  mincMetrics = new TabletServerMinCMetrics();
  ceMetrics = new CompactionExecutorsMetrics();
  MetricsUtil.initializeProducers(metrics, updateMetrics, scanMetrics, mincMetrics, ceMetrics);

  // Each iteration over this Iterable takes a fresh snapshot of the online tablets.
  this.compactionManager = new CompactionManager(
      () -> Iterators.transform(onlineTablets.snapshot().values().iterator(),
          Tablet::asCompactable),
      getContext(), ceMetrics);
  compactionManager.start();

  try {
    walMarker.initWalMarker(getTabletSession());
  } catch (Exception e) {
    log.error("Unable to create WAL marker node in zookeeper", e);
    throw new RuntimeException(e);
  }

  // The same thread pool serves the bulk-failed-copy queue and the recovery log watcher.
  ThreadPoolExecutor distWorkQThreadPool =
      ThreadPools.createExecutorService(getConfiguration(), Property.TSERV_WORKQ_THREADS, true);

  bulkFailedCopyQ = new DistributedWorkQueue(
      getContext().getZooKeeperRoot() + Constants.ZBULK_FAILED_COPYQ, getConfiguration(),
      getContext());
  try {
    bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(getContext()),
        distWorkQThreadPool);
  } catch (Exception e1) {
    throw new RuntimeException("Failed to start distributed work queue for copying ", e1);
  }

  try {
    logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool);
  } catch (Exception ex) {
    log.error("Error setting watches for recoveries");
    throw new RuntimeException(ex);
  }

  final AccumuloConfiguration aconf = getConfiguration();

  // if the replication name is ever set, then start replication services
  @SuppressWarnings("deprecation")
  Property p = Property.REPLICATION_NAME;
  context.getScheduledExecutor().scheduleWithFixedDelay(() -> {
    try {
      if (this.replServer == null && !getConfiguration().get(p).isEmpty()) {
        log.info("{} was set, starting repl services.", p.getKey());
        setupReplication(aconf);
      }
    } catch (RuntimeException e) {
      // An exception escaping a periodic task cancels all of its future runs, and nothing
      // here inspects the returned future, so report the failure and let the next run retry.
      log.error("Error while checking whether replication services need to be started", e);
    }
  }, 0, 5, TimeUnit.SECONDS);

  // random 30-60 minute delay
  int tabletCheckFrequency = 30 + random.nextInt(31);
  // Periodically check that metadata of tablets matches what is held in memory
  ThreadPools.createGeneralScheduledExecutorService(aconf).scheduleWithFixedDelay(() -> {
    try {
      final SortedMap<KeyExtent,Tablet> onlineTabletsSnapshot = onlineTablets.snapshot();

      // gather updateCounts for each tablet before the metadata is read
      Map<KeyExtent,Long> updateCounts = new HashMap<>();
      onlineTabletsSnapshot.forEach((ke, tablet) -> updateCounts.put(ke, tablet.getUpdateCount()));

      // gather metadata for all tablets readTablets()
      try (TabletsMetadata tabletsMetadata =
          getContext().getAmple().readTablets().forTablets(onlineTabletsSnapshot.keySet())
              .fetch(FILES, LOGS, ECOMP, PREV_ROW).build()) {
        // for each tablet, compare its metadata to what is held in memory
        tabletsMetadata.forEach(tabletMetadata -> {
          KeyExtent extent = tabletMetadata.getExtent();
          Tablet tablet = onlineTabletsSnapshot.get(extent);
          Long counter = updateCounts.get(extent);
          tablet.compareTabletInfo(counter, tabletMetadata);
        });
      }
    } catch (RuntimeException e) {
      // Same reasoning as for the replication check: without this handler a single failure
      // would silently end the periodic consistency check for the life of the process.
      log.error("Error while comparing tablet metadata to in-memory tablet state", e);
    }
  }, tabletCheckFrequency, tabletCheckFrequency, TimeUnit.MINUTES);

  final long CLEANUP_BULK_LOADED_CACHE_MILLIS = TimeUnit.MINUTES.toMillis(15);
  context.getScheduledExecutor().scheduleWithFixedDelay(new BulkImportCacheCleaner(this),
      CLEANUP_BULK_LOADED_CACHE_MILLIS, CLEANUP_BULK_LOADED_CACHE_MILLIS,
      TimeUnit.MILLISECONDS);

  while (!serverStopRequested) {
    // send all of the pending messages
    try {
      ManagerMessage mm = null;
      ManagerClientService.Client iface = null;

      try {
        // wait until a message is ready to send, or a server stop was requested
        while (mm == null && !serverStopRequested) {
          mm = managerMessages.poll(1, TimeUnit.SECONDS);
        }

        // have a message to send to the manager, so grab a
        // connection
        HostAndPort managerHost = getManagerAddress();
        iface = managerConnection(managerHost);
        TServiceClient client = iface;

        // if the loop below does not execute at all while mm != null,
        // then finally block should place mm back on queue
        while (!serverStopRequested && mm != null && client != null
            && client.getOutputProtocol() != null
            && client.getOutputProtocol().getTransport() != null
            && client.getOutputProtocol().getTransport().isOpen()) {
          try {
            mm.send(getContext().rpcCreds(), getClientAddressString(), iface);
            mm = null;
          } catch (TException ex) {
            log.warn("Error sending message: queuing message again");
            managerMessages.putFirst(mm);
            // cleared so that the finally block does not queue the same message twice
            mm = null;
            throw ex;
          }

          // if any messages are immediately available grab em and
          // send them
          mm = managerMessages.poll();
        }
      } finally {
        if (mm != null) {
          managerMessages.putFirst(mm);
        }
        returnManagerConnection(iface);

        // pause between rounds, whether or not the round succeeded
        sleepUninterruptibly(1, TimeUnit.SECONDS);
      }
    } catch (InterruptedException e) {
      // NOTE(review): the interrupt status is not restored here; the method goes on to
      // wait() and to run the teardown below on this same thread.
      log.info("Interrupt Exception received, shutting down");
      serverStopRequested = true;
    } catch (Exception e) {
      // may have lost connection with manager
      // loop back to the beginning and wait for a new one
      // this way we survive manager failures
      log.error("{}: TServerInfo: Exception. Manager down?", getClientAddressString(), e);
    }
  }

  // Block until shutdownComplete is set before tearing anything down. The timed wait re-checks
  // the flag every second, so a missed notification cannot stall this loop indefinitely.
  synchronized (this) {
    while (!shutdownComplete) {
      try {
        this.wait(1000);
      } catch (InterruptedException e) {
        log.error(e.toString());
      }
    }
  }

  log.debug("Stopping Replication Server");
  if (this.replServer != null) {
    this.replServer.stop();
  }

  log.debug("Stopping Thrift Servers");
  if (server != null) {
    server.stop();
  }

  try {
    log.debug("Closing filesystems");
    getVolumeManager().close();
  } catch (IOException e) {
    log.warn("Failed to close filesystem : {}", e.getMessage(), e);
  }

  gcLogger.logGCInfo(getConfiguration());

  log.info("TServerInfo: stop requested. exiting ... ");

  // Releasing the lock is the last step; a failure here is only logged.
  try {
    tabletServerLock.unlock();
  } catch (Exception e) {
    log.warn("Failed to release tablet server lock", e);
  }
}
Aggregations