Use of org.apache.accumulo.tserver.mastermessage.MasterMessage in project accumulo by apache.

The class TabletServer, method run(). This is the tablet server's main loop: after initializing ZooKeeper paths, metrics, the Thrift services, and the background work queues, it repeatedly drains queued MasterMessages and sends them to the master until shutdown is requested.
// main loop listens for client requests
@Override
public void run() {
  SecurityUtil.serverLogin(SiteConfiguration.getInstance());
  // Make sure the ZooKeeper paths exist before we try to use them.
  try {
    ZooKeeperInitialization.ensureZooKeeperInitialized(ZooReaderWriter.getInstance(), ZooUtil.getRoot(getInstance()));
  } catch (KeeperException | InterruptedException e) {
    log.error("Could not ensure that ZooKeeper is properly initialized", e);
    throw new RuntimeException(e);
  }
  Metrics tserverMetrics = metricsFactory.createTabletServerMetrics(this);
  // Register MBeans
  try {
    tserverMetrics.register();
    mincMetrics.register();
    scanMetrics.register();
    updateMetrics.register();
  } catch (Exception e) {
    log.error("Error registering with JMX", e);
  }
  if (null != authKeyWatcher) {
    log.info("Seeding ZooKeeper watcher for authentication keys");
    try {
      authKeyWatcher.updateAuthKeys();
    } catch (KeeperException | InterruptedException e) {
      // TODO Does there need to be a better check? What are the error conditions that we'd fall out here? AUTH_FAILURE?
      // If we get the error, do we just put it on a timer and retry the exists(String, Watcher) call?
      log.error("Failed to perform initial check for authentication tokens in ZooKeeper. Delegation token authentication will be unavailable.", e);
    }
  }
  try {
    clientAddress = startTabletClientService();
  } catch (UnknownHostException e1) {
    throw new RuntimeException("Failed to start the tablet client service", e1);
  }
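  // advertise this tablet server in ZooKeeper (acquire the tserver lock) so the master can discover it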
  announceExistence();
  try {
    walMarker.initWalMarker(getTabletSession());
  } catch (Exception e) {
    log.error("Unable to create WAL marker node in zookeeper", e);
    throw new RuntimeException(e);
  }
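  // single thread pool shared by the two distributed work queues below: copying failed bulk imports and sorting recovery logs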
  ThreadPoolExecutor distWorkQThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.TSERV_WORKQ_THREADS), "distributed work queue");
  bulkFailedCopyQ = new DistributedWorkQueue(ZooUtil.getRoot(getInstance()) + Constants.ZBULK_FAILED_COPYQ, getConfiguration());
  try {
    bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(), distWorkQThreadPool);
  } catch (Exception e1) {
    throw new RuntimeException("Failed to start distributed work queue for copying ", e1);
  }
  try {
    logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool);
  } catch (Exception ex) {
    log.error("Error setting watches for recoveries");
    throw new RuntimeException(ex);
  }
  // Start the thrift service listening for incoming replication requests
  try {
    replicationAddress = startReplicationService();
  } catch (UnknownHostException e) {
    throw new RuntimeException("Failed to start replication service", e);
  }
  // Start the pool to handle outgoing replications
  final ThreadPoolExecutor replicationThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.REPLICATION_WORKER_THREADS), "replication task");
  replWorker.setExecutor(replicationThreadPool);
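  // note: run() here is expected to only register the replication work-queue processor and return; it does not block this thread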
  replWorker.run();
  // Check the configuration value for the size of the pool and, if changed, resize the pool (initial delay 10s, then every 30s)
  final AccumuloConfiguration aconf = getConfiguration();
  Runnable replicationWorkThreadPoolResizer = new Runnable() {
    @Override
    public void run() {
      int maxPoolSize = aconf.getCount(Property.REPLICATION_WORKER_THREADS);
      if (replicationThreadPool.getMaximumPoolSize() != maxPoolSize) {
        log.info("Resizing thread pool for sending replication work from {} to {}", replicationThreadPool.getMaximumPoolSize(), maxPoolSize);
        replicationThreadPool.setMaximumPoolSize(maxPoolSize);
      }
    }
  };
  SimpleTimer.getInstance(aconf).schedule(replicationWorkThreadPoolResizer, 10000, 30000);
  final long CLEANUP_BULK_LOADED_CACHE_MILLIS = 15 * 60 * 1000;
  SimpleTimer.getInstance(aconf).schedule(new BulkImportCacheCleaner(this), CLEANUP_BULK_LOADED_CACHE_MILLIS, CLEANUP_BULK_LOADED_CACHE_MILLIS);
  HostAndPort masterHost;
  while (!serverStopRequested) {
    // send all of the pending messages
    try {
      MasterMessage mm = null;
      MasterClientService.Client iface = null;
      try {
        // wait until a message is ready to send, or a server stop was requested
        while (mm == null && !serverStopRequested) {
          mm = masterMessages.poll(1000, TimeUnit.MILLISECONDS);
        }
        // have a message to send to the master, so grab a connection
        masterHost = getMasterAddress();
        iface = masterConnection(masterHost);
        TServiceClient client = iface;
        // if the loop below does not execute at all and mm != null,
        // then the finally block should place mm back on the queue
        while (!serverStopRequested && mm != null && client != null && client.getOutputProtocol() != null
            && client.getOutputProtocol().getTransport() != null && client.getOutputProtocol().getTransport().isOpen()) {
          try {
            mm.send(rpcCreds(), getClientAddressString(), iface);
            mm = null;
          } catch (TException ex) {
            log.warn("Error sending message: queuing message again");
            masterMessages.putFirst(mm);
            mm = null;
            throw ex;
          }
          // if any messages are immediately available grab em and send them
          mm = masterMessages.poll();
        }
      } finally {
        if (mm != null) {
          masterMessages.putFirst(mm);
        }
        returnMasterConnection(iface);
        sleepUninterruptibly(1, TimeUnit.SECONDS);
      }
    } catch (InterruptedException e) {
      log.info("Interrupt Exception received, shutting down");
      serverStopRequested = true;
    } catch (Exception e) {
      // may have lost connection with master
      // loop back to the beginning and wait for a new one
      // this way we survive master failures
      log.error(getClientAddressString() + ": TServerInfo: Exception. Master down?", e);
    }
  }
  // wait for the shutdown running on another thread to finish, so this object does not
  // get prematurely finalized
  synchronized (this) {
    while (!shutdownComplete) {
      try {
        this.wait(1000);
      } catch (InterruptedException e) {
        log.error(e.toString());
      }
    }
  }
  log.debug("Stopping Replication Server");
  TServerUtils.stopTServer(this.replServer);
  log.debug("Stopping Thrift Servers");
  TServerUtils.stopTServer(server);
  try {
    log.debug("Closing filesystem");
    fs.close();
  } catch (IOException e) {
    log.warn("Failed to close filesystem : {}", e.getMessage(), e);
  }
  gcLogger.logGCInfo(getConfiguration());
  log.info("TServerInfo: stop requested. exiting ... ");
  try {
    tabletServerLock.unlock();
  } catch (Exception e) {
    log.warn("Failed to release tablet server lock", e);
  }
}
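For context on how these messages arrive: the loop drains masterMessages, a blocking deque that other tablet-server threads fill whenever there is status to report to the master (for example a completed split). Below is a minimal sketch of that contract, assuming Accumulo 1.8 package names; MasterMessageQueueSketch and its enqueueMasterMessage helper are illustrative simplifications, not the verbatim source.

// Sketch only: the shape of the contract the loop above relies on.
// Package names follow the Accumulo 1.8 thrift bindings and may differ in other versions.
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import org.apache.accumulo.core.master.thrift.MasterClientService;
import org.apache.accumulo.core.security.thrift.TCredentials;
import org.apache.thrift.TException;

interface MasterMessage {
  // Invoked by TabletServer.run() once a master connection is open; real
  // implementors (e.g. SplitReportMessage, TabletStatusMessage) wrap a single RPC each.
  void send(TCredentials credentials, String serverName, MasterClientService.Iface client) throws TException;
}

class MasterMessageQueueSketch {
  // Mirrors the masterMessages field drained by run(). A BlockingDeque lets a
  // failed message be pushed back to the front with putFirst(), as the
  // catch/finally blocks above do, so send order is preserved.
  private final BlockingDeque<MasterMessage> masterMessages = new LinkedBlockingDeque<>();

  // Hypothetical producer-side helper: other tserver threads enqueue status
  // reports, which the main loop later sends to the master.
  public void enqueueMasterMessage(MasterMessage m) {
    masterMessages.addLast(m);
  }
}

The deque choice is what makes the error handling in the loop work: a message that fails mid-send is returned to the head of the queue rather than the tail, so it is retried first once a new master connection is available.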