use of org.apache.accumulo.core.util.HostAndPort in project accumulo by apache.
the class Monitor method fetchScans.
/**
 * Refreshes the cached active-scan statistics of every tablet server and drops stale entries.
 *
 * <p>
 * Does nothing when {@code instance} is {@code null}. Otherwise each tablet server reported by the connector is asked for its active scans and the
 * result is stored in {@code allScans}, keyed by the server's parsed address. A failure while querying one server is logged at debug level and does
 * not stop the remaining servers from being queried. Afterwards every entry whose {@code fetched} value is more than five minutes behind the current
 * time is removed.
 *
 * <p>
 * All accesses to {@code allScans} made here, including the age-off pass, hold the {@code allScans} monitor so that they cannot interleave with each
 * other or with any other code that synchronizes on the same map.
 *
 * @throws Exception
 *           if the connector or a tablet server client cannot be obtained
 */
public static void fetchScans() throws Exception {
  if (instance == null)
    return;
  Connector c = context.getConnector();
  for (String server : c.instanceOperations().getTabletServers()) {
    final HostAndPort parsedServer = HostAndPort.fromString(server);
    Client tserver = ThriftUtil.getTServerClient(parsedServer, context);
    try {
      List<ActiveScan> scans = tserver.getActiveScans(null, context.rpcCreds());
      synchronized (allScans) {
        allScans.put(parsedServer, new ScanStats(scans));
      }
    } catch (Exception ex) {
      // best effort: an unreachable server simply keeps its previous (aging) entry
      log.debug("Failed to get active scans from {}", server, ex);
    } finally {
      ThriftUtil.returnClient(tserver);
    }
  }
  // Age off old scan information. The map is guarded by its own monitor when it is written above, so the
  // iteration and removal must hold the same monitor to avoid racing with concurrent access.
  final long maxAgeMillis = 5 * 60 * 1000;
  final long now = System.currentTimeMillis();
  synchronized (allScans) {
    Iterator<Entry<HostAndPort, ScanStats>> entryIter = allScans.entrySet().iterator();
    while (entryIter.hasNext()) {
      Entry<HostAndPort, ScanStats> entry = entryIter.next();
      if (now - entry.getValue().fetched > maxAgeMillis) {
        entryIter.remove();
      }
    }
  }
}
use of org.apache.accumulo.core.util.HostAndPort in project accumulo by apache.
the class ZooKeeperStatus method run.
/**
 * Polls every configured ZooKeeper server until {@code stop} is set.
 *
 * <p>
 * On each pass the comma-separated host list is read from the site configuration, the four-letter command {@code stat} is sent to every host (port
 * 2181 when none is given) and the reply is scanned: each line starting with a space is counted as a client and the text after the first colon of a
 * line starting with {@code Mode} is taken as the server mode. A host that cannot be queried is recorded as {@code "Down"} with a client count of
 * {@code -1}. The collected states replace {@code status}, then the thread sleeps for five seconds.
 */
@Override
public void run() {
  while (!stop) {
    TreeSet<ZooKeeperState> latest = new TreeSet<>();
    String hostList = SiteConfiguration.getInstance().get(Property.INSTANCE_ZK_HOST);
    for (String keeper : hostList.split(",")) {
      int clients = 0;
      String mode = "unknown";
      String[] hostAndPort = keeper.split(":");
      TTransport transport = null;
      try {
        // fall back to the port 2181 when the entry carries no explicit port
        int port = hostAndPort.length > 1 ? Integer.parseInt(hostAndPort[1]) : 2181;
        HostAndPort addr = HostAndPort.fromParts(hostAndPort[0], port);
        transport = TTimeoutTransport.create(addr, 10 * 1000L);
        transport.write("stat\n".getBytes(UTF_8), 0, 5);
        StringBuilder response = new StringBuilder();
        try {
          transport.flush();
          byte[] buffer = new byte[1024 * 100];
          for (int n = transport.read(buffer, 0, buffer.length); n > 0; n = transport.read(buffer, 0, buffer.length)) {
            response.append(new String(buffer, 0, n, UTF_8));
          }
        } catch (TTransportException ex) {
          // happens at EOF
        }
        for (String line : response.toString().split("\n")) {
          if (line.startsWith(" ")) {
            clients++;
          }
          if (line.startsWith("Mode")) {
            mode = line.split(":")[1];
          }
        }
        latest.add(new ZooKeeperState(keeper, mode, clients));
      } catch (Exception ex) {
        log.info("Exception talking to zookeeper " + keeper, ex);
        latest.add(new ZooKeeperState(keeper, "Down", -1));
      } finally {
        if (transport != null) {
          try {
            transport.close();
          } catch (Exception ex) {
            log.error("Exception", ex);
          }
        }
      }
    }
    // publish the complete snapshot in one assignment
    status = latest;
    sleepUninterruptibly(5, TimeUnit.SECONDS);
  }
}
use of org.apache.accumulo.core.util.HostAndPort in project accumulo by apache.
the class LoadFiles method call.
/**
 * Asks tablet servers to bulk import every file found in the {@code bulk} directory and records the files that could not be loaded.
 *
 * <p>
 * The error directory is first probed for writability by creating and deleting a {@code .iswritable} marker. Each file is then handed, one per task,
 * to a randomly chosen online tablet server (optionally restricted to hosts matching {@code MASTER_BULK_TSERVER_REGEX}). Files that are not loaded
 * are retried up to {@code MASTER_BULK_RETRIES} times (at least once). Whatever is still unloaded afterwards is written, one path per line, to
 * {@code BulkImport.FAILURES_TXT} in the error directory.
 *
 * @param tid
 *          transaction id of this operation, passed to the tablet servers and used in log messages
 * @param master
 *          master providing configuration, file system, online tablet servers and RPC credentials
 * @return the next step, which will perform cleanup
 * @throws Exception
 *           if the error directory is not writable, a file system call fails, or waiting on an import task fails
 */
@Override
public Repo<Master> call(final long tid, final Master master) throws Exception {
  master.updateBulkImportStatus(source, BulkImportState.LOADING);
  ExecutorService executor = getThreadPool(master);
  final AccumuloConfiguration conf = master.getConfiguration();
  VolumeManager fs = master.getFileSystem();
  List<FileStatus> files = new ArrayList<>();
  for (FileStatus entry : fs.listStatus(new Path(bulk))) {
    files.add(entry);
  }
  log.debug("tid " + tid + " importing " + files.size() + " files");
  // Probe the error directory by creating a marker file in it
  Path writable = new Path(this.errorDir, ".iswritable");
  if (!fs.createNewFile(writable)) {
    // Maybe this is a re-try... clear the flag and try again
    fs.delete(writable);
    if (!fs.createNewFile(writable))
      throw new AcceptableThriftTableOperationException(tableId.canonicalID(), null, TableOperation.BULK_IMPORT, TableOperationExceptionType.BULK_BAD_ERROR_DIRECTORY, "Unable to write to " + this.errorDir);
  }
  fs.delete(writable);
  final Set<String> filesToLoad = Collections.synchronizedSet(new HashSet<String>());
  for (FileStatus f : files)
    filesToLoad.add(f.getPath().toString());
  final int RETRIES = Math.max(1, conf.getCount(Property.MASTER_BULK_RETRIES));
  for (int attempt = 0; attempt < RETRIES && filesToLoad.size() > 0; attempt++) {
    List<Future<List<String>>> results = new ArrayList<>();
    if (master.onlineTabletServers().size() == 0)
      log.warn("There are no tablet server to process bulk import, waiting (tid = " + tid + ")");
    while (master.onlineTabletServers().size() == 0) {
      sleepUninterruptibly(500, TimeUnit.MILLISECONDS);
    }
    // Use the threadpool to assign files one-at-a-time to the server
    final List<String> loaded = Collections.synchronizedList(new ArrayList<String>());
    final Random random = new Random();
    final TServerInstance[] servers;
    String prop = conf.get(Property.MASTER_BULK_TSERVER_REGEX);
    if (null == prop || "".equals(prop)) {
      servers = master.onlineTabletServers().toArray(new TServerInstance[0]);
    } else {
      Pattern regex = Pattern.compile(prop);
      List<TServerInstance> subset = new ArrayList<>();
      master.onlineTabletServers().forEach(t -> {
        if (regex.matcher(t.host()).matches()) {
          subset.add(t);
        }
      });
      if (0 == subset.size()) {
        // report the regex that was actually compiled and applied above
        log.warn("There are no tablet servers online that match supplied regex: {}", prop);
      }
      servers = subset.toArray(new TServerInstance[0]);
    }
    if (servers.length > 0) {
      for (final String file : filesToLoad) {
        results.add(executor.submit(new Callable<List<String>>() {
          @Override
          public List<String> call() {
            List<String> failures = new ArrayList<>();
            ClientService.Client client = null;
            HostAndPort server = null;
            try {
              // get a connection to a random tablet server, do not prefer cached connections because
              // this is running on the master and there are lots of connections to tablet servers
              // serving the metadata tablets
              long timeInMillis = master.getConfiguration().getTimeInMillis(Property.MASTER_BULK_TIMEOUT);
              server = servers[random.nextInt(servers.length)].getLocation();
              client = ThriftUtil.getTServerClient(server, master, timeInMillis);
              List<String> attempt = Collections.singletonList(file);
              log.debug("Asking " + server + " to bulk import " + file);
              List<String> fail = client.bulkImportFiles(Tracer.traceInfo(), master.rpcCreds(), tid, tableId.canonicalID(), attempt, errorDir, setTime);
              if (fail.isEmpty()) {
                loaded.add(file);
              } else {
                failures.addAll(fail);
              }
            } catch (Exception ex) {
              // pass the exception as the throwable so the stack trace is not lost
              log.error("rpc failed server:" + server + ", tid:" + tid + " " + ex, ex);
            } finally {
              ThriftUtil.returnClient(client);
            }
            return failures;
          }
        }));
      }
    }
    // Wait for every submitted task. The outcome that matters is already recorded in "loaded", so the
    // returned failure lists are not collected; get() still propagates any task error to the caller.
    for (Future<List<String>> f : results)
      f.get();
    filesToLoad.removeAll(loaded);
    if (filesToLoad.size() > 0) {
      log.debug("tid " + tid + " attempt " + (attempt + 1) + " " + sampleList(filesToLoad, 10) + " failed");
      sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
    }
  }
  // Record the files that never loaded, one path per line
  FSDataOutputStream failFile = fs.create(new Path(errorDir, BulkImport.FAILURES_TXT), true);
  try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(failFile, UTF_8))) {
    for (String f : filesToLoad) {
      out.write(f);
      out.write("\n");
    }
  }
  // return the next step, which will perform cleanup
  return new CompleteBulkImport(tableId, source, bulk, errorDir);
}
use of org.apache.accumulo.core.util.HostAndPort in project accumulo by apache.
the class TabletServer method run.
/**
 * Main loop of the tablet server.
 *
 * <p>
 * The method runs in three phases:
 * <ol>
 * <li>Start-up: ensures the ZooKeeper paths exist, registers metrics, starts the tablet client service, the distributed work queue, recovery-log
 * watching and the replication service, and schedules the periodic maintenance tasks. Most start-up failures are rethrown as
 * {@link RuntimeException}; failures registering metrics or seeding authentication keys are only logged.</li>
 * <li>Message pump: until {@code serverStopRequested} is set, takes messages from {@code masterMessages} and sends them over a master connection.
 * A message that could not be sent is put back at the head of the queue.</li>
 * <li>Shutdown: waits for {@code shutdownComplete}, stops the replication and thrift servers, closes the file system and releases the tablet
 * server lock.</li>
 * </ol>
 */
@Override
public void run() {
SecurityUtil.serverLogin(SiteConfiguration.getInstance());
// We can just make the zookeeper paths before we try to use.
try {
ZooKeeperInitialization.ensureZooKeeperInitialized(ZooReaderWriter.getInstance(), ZooUtil.getRoot(getInstance()));
} catch (KeeperException | InterruptedException e) {
log.error("Could not ensure that ZooKeeper is properly initialized", e);
throw new RuntimeException(e);
}
Metrics tserverMetrics = metricsFactory.createTabletServerMetrics(this);
// Register MBeans; a registration failure is logged and start-up continues
try {
tserverMetrics.register();
mincMetrics.register();
scanMetrics.register();
updateMetrics.register();
} catch (Exception e) {
log.error("Error registering with JMX", e);
}
// authKeyWatcher may be null, in which case the seeding below is skipped
if (null != authKeyWatcher) {
log.info("Seeding ZooKeeper watcher for authentication keys");
try {
authKeyWatcher.updateAuthKeys();
} catch (KeeperException | InterruptedException e) {
// TODO Does there need to be a better check? What are the error conditions that we'd fall out here? AUTH_FAILURE?
// If we get the error, do we just put it on a timer and retry the exists(String, Watcher) call?
log.error("Failed to perform initial check for authentication tokens in ZooKeeper. Delegation token authentication will be unavailable.", e);
}
}
// Start the tablet client service and remember the address it returned; failure is fatal
try {
clientAddress = startTabletClientService();
} catch (UnknownHostException e1) {
throw new RuntimeException("Failed to start the tablet client service", e1);
}
// NOTE(review): presumably advertises this server so that it can be discovered -- confirm in announceExistence()
announceExistence();
try {
walMarker.initWalMarker(getTabletSession());
} catch (Exception e) {
log.error("Unable to create WAL marker node in zookeeper", e);
throw new RuntimeException(e);
}
// One thread pool is shared by the bulk-failed-copy work queue and the recovery log sorter
ThreadPoolExecutor distWorkQThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.TSERV_WORKQ_THREADS), "distributed work queue");
bulkFailedCopyQ = new DistributedWorkQueue(ZooUtil.getRoot(getInstance()) + Constants.ZBULK_FAILED_COPYQ, getConfiguration());
try {
bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(), distWorkQThreadPool);
} catch (Exception e1) {
throw new RuntimeException("Failed to start distributed work queue for copying ", e1);
}
try {
logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool);
} catch (Exception ex) {
log.error("Error setting watches for recoveries");
throw new RuntimeException(ex);
}
// Start the thrift service listening for incoming replication requests
try {
replicationAddress = startReplicationService();
} catch (UnknownHostException e) {
throw new RuntimeException("Failed to start replication service", e);
}
// Start the pool to handle outgoing replications
final ThreadPoolExecutor replicationThreadPool = new SimpleThreadPool(getConfiguration().getCount(Property.REPLICATION_WORKER_THREADS), "replication task");
replWorker.setExecutor(replicationThreadPool);
// NOTE(review): run() is invoked directly on the calling thread here, not via a new thread -- confirm it returns promptly
replWorker.run();
// Periodically check the configured size of the replication pool and, if it changed, resize the pool.
// The task is scheduled with the values 10000 and 30000 (presumably initial delay and period in milliseconds -- confirm against SimpleTimer).
final AccumuloConfiguration aconf = getConfiguration();
Runnable replicationWorkThreadPoolResizer = new Runnable() {
@Override
public void run() {
int maxPoolSize = aconf.getCount(Property.REPLICATION_WORKER_THREADS);
if (replicationThreadPool.getMaximumPoolSize() != maxPoolSize) {
log.info("Resizing thread pool for sending replication work from {} to {}", replicationThreadPool.getMaximumPoolSize(), maxPoolSize);
replicationThreadPool.setMaximumPoolSize(maxPoolSize);
}
}
};
SimpleTimer.getInstance(aconf).schedule(replicationWorkThreadPoolResizer, 10000, 30000);
// 15 minutes, used for both scheduling arguments of the bulk import cache cleaner
final long CLEANUP_BULK_LOADED_CACHE_MILLIS = 15 * 60 * 1000;
SimpleTimer.getInstance(aconf).schedule(new BulkImportCacheCleaner(this), CLEANUP_BULK_LOADED_CACHE_MILLIS, CLEANUP_BULK_LOADED_CACHE_MILLIS);
HostAndPort masterHost;
while (!serverStopRequested) {
// send all of the pending messages
try {
MasterMessage mm = null;
MasterClientService.Client iface = null;
try {
// wait, polling in one-second slices, until a message is ready to send or a server stop
// was requested
while (mm == null && !serverStopRequested) {
mm = masterMessages.poll(1000, TimeUnit.MILLISECONDS);
}
// have a message to send to the master, so grab a
// connection
masterHost = getMasterAddress();
iface = masterConnection(masterHost);
TServiceClient client = iface;
// if the while loop below does not execute at all (e.g. no usable connection) and mm != null,
// then finally block should place mm back on queue
while (!serverStopRequested && mm != null && client != null && client.getOutputProtocol() != null && client.getOutputProtocol().getTransport() != null && client.getOutputProtocol().getTransport().isOpen()) {
try {
mm.send(rpcCreds(), getClientAddressString(), iface);
mm = null;
} catch (TException ex) {
log.warn("Error sending message: queuing message again");
masterMessages.putFirst(mm);
// cleared so that the finally block does not queue the same message a second time
mm = null;
throw ex;
}
// if any messages are immediately available grab em and
// send them
mm = masterMessages.poll();
}
} finally {
// re-queue an unsent message at the head of the queue
if (mm != null) {
masterMessages.putFirst(mm);
}
returnMasterConnection(iface);
// pause one second before the next pass of the outer loop
sleepUninterruptibly(1, TimeUnit.SECONDS);
}
} catch (InterruptedException e) {
log.info("Interrupt Exception received, shutting down");
serverStopRequested = true;
} catch (Exception e) {
// may have lost connection with master
// loop back to the beginning and wait for a new one
// this way we survive master failures
log.error(getClientAddressString() + ": TServerInfo: Exception. Master down?", e);
}
}
// Wait for shutdown: block here, re-checking every second, until shutdownComplete is set so that this
// thread does not proceed to tear down the servers before then.
// NOTE(review): presumably the shutdown runs in another thread that sets the flag -- confirm where it is written
synchronized (this) {
while (!shutdownComplete) {
try {
this.wait(1000);
} catch (InterruptedException e) {
// an interrupt is only logged; the loop keeps waiting for shutdownComplete
log.error(e.toString());
}
}
}
log.debug("Stopping Replication Server");
TServerUtils.stopTServer(this.replServer);
log.debug("Stopping Thrift Servers");
TServerUtils.stopTServer(server);
try {
log.debug("Closing filesystem");
fs.close();
} catch (IOException e) {
log.warn("Failed to close filesystem : {}", e.getMessage(), e);
}
gcLogger.logGCInfo(getConfiguration());
log.info("TServerInfo: stop requested. exiting ... ");
// releasing the lock is the last step; a failure is logged, not rethrown
try {
tabletServerLock.unlock();
} catch (Exception e) {
log.warn("Failed to release tablet server lock", e);
}
}
use of org.apache.accumulo.core.util.HostAndPort in project accumulo by apache.
the class TabletServerResource method getTserverDetails.
/**
 * Generates details for the selected tserver
 *
 * <p>
 * The address must match the name of one of the tablet servers currently reported by the monitor. Tablet statistics for every known table and the
 * historical statistics are then fetched from that server and summarised.
 *
 * @param tserverAddress
 *          TServer name
 * @return TServer details, or {@code null} when the address does not belong to a known tablet server or its statistics could not be fetched
 */
@Path("{address}")
@GET
public TabletServerSummary getTserverDetails(@PathParam("address") @NotNull @Pattern(regexp = SERVER_REGEX) String tserverAddress) throws Exception {
  boolean tserverExists = false;
  for (TabletServerStatus ts : Monitor.getMmi().getTServerInfo()) {
    if (tserverAddress.equals(ts.getName())) {
      tserverExists = true;
      break;
    }
  }
  if (!tserverExists) {
    return null;
  }
  double totalElapsedForAll = 0;
  double splitStdDev = 0;
  double minorStdDev = 0;
  double minorQueueStdDev = 0;
  double majorStdDev = 0;
  double majorQueueStdDev = 0;
  double currentMinorAvg = 0;
  double currentMajorAvg = 0;
  double currentMinorStdDev = 0;
  double currentMajorStdDev = 0;
  // start from empty accumulators for this request
  total = new TabletStats(null, new ActionStats(), new ActionStats(), new ActionStats(), 0, 0, 0, 0);
  HostAndPort address = HostAndPort.fromString(tserverAddress);
  historical = new TabletStats(null, new ActionStats(), new ActionStats(), new ActionStats(), 0, 0, 0, 0);
  List<TabletStats> tsStats = new ArrayList<>();
  try {
    ClientContext context = Monitor.getContext();
    TabletClientService.Client client = ThriftUtil.getClient(new TabletClientService.Client.Factory(), address, context);
    try {
      for (String tableId : Monitor.getMmi().tableMap.keySet()) {
        tsStats.addAll(client.getTabletStats(Tracer.traceInfo(), context.rpcCreds(), tableId));
      }
      historical = client.getHistoricalStats(Tracer.traceInfo(), context.rpcCreds());
    } finally {
      ThriftUtil.returnClient(client);
    }
  } catch (Exception e) {
    // the server could not be queried: report "no details" instead of failing the request
    return null;
  }
  // NOTE(review): "total" is read right after this call, so doCurrentOperations presumably accumulates into it -- confirm
  List<CurrentOperations> currentOps = doCurrentOperations(tsStats);
  // NOTE(review): only the minor average is truncated with a (long) cast; the major average below is not -- confirm which is intended
  if (total.minors.num != 0)
    currentMinorAvg = (long) (total.minors.elapsed / total.minors.num);
  if (total.minors.elapsed != 0 && total.minors.num != 0)
    currentMinorStdDev = stddev(total.minors.elapsed, total.minors.num, total.minors.sumDev);
  if (total.majors.num != 0)
    currentMajorAvg = total.majors.elapsed / total.majors.num;
  if (total.majors.elapsed != 0 && total.majors.num != 0 && total.majors.elapsed > total.majors.num)
    currentMajorStdDev = stddev(total.majors.elapsed, total.majors.num, total.majors.sumDev);
  // fold the historical numbers into the current totals before computing the all-time figures
  ActionStatsUpdator.update(total.minors, historical.minors);
  ActionStatsUpdator.update(total.majors, historical.majors);
  totalElapsedForAll += total.majors.elapsed + historical.splits.elapsed + total.minors.elapsed;
  minorStdDev = stddev(total.minors.elapsed, total.minors.num, total.minors.sumDev);
  minorQueueStdDev = stddev(total.minors.queueTime, total.minors.num, total.minors.queueSumDev);
  majorStdDev = stddev(total.majors.elapsed, total.majors.num, total.majors.sumDev);
  majorQueueStdDev = stddev(total.majors.queueTime, total.majors.num, total.majors.queueSumDev);
  // same (time, count, sumDev) argument order as every other stddev call in this method
  splitStdDev = stddev(historical.splits.elapsed, historical.splits.num, historical.splits.sumDev);
  TabletServerDetailInformation details = doDetails(address, tsStats.size());
  List<AllTimeTabletResults> allTime = doAllTimeResults(majorQueueStdDev, minorQueueStdDev, totalElapsedForAll, splitStdDev, majorStdDev, minorStdDev);
  CurrentTabletResults currentRes = doCurrentTabletResults(currentMinorAvg, currentMinorStdDev, currentMajorAvg, currentMajorStdDev);
  TabletServerSummary tserverDetails = new TabletServerSummary(details, allTime, currentRes, currentOps);
  return tserverDetails;
}
Aggregations