use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.
the class LogLevelTest method parseJobWorkerTargets.
/**
 * Checks that the {@code job_workers} target is expanded into one
 * {@link LogLevel.TargetInfo} per job worker returned by the (mocked) job master client,
 * each using the job worker web port and the {@code job_worker} role.
 */
@Test
public void parseJobWorkerTargets() throws Exception {
  // Simulate the command line: --target job_workers
  String[] args = {"--target", "job_workers"};
  CommandLine commandLine = mock(CommandLine.class);
  when(commandLine.getArgs()).thenReturn(args);
  when(commandLine.hasOption(LogLevel.TARGET_OPTION_NAME)).thenReturn(true);
  when(commandLine.getOptionValue(LogLevel.TARGET_OPTION_NAME)).thenReturn(args[1]);

  // The job master reports one health entry per expected host, in this order.
  String[] expectedHosts = {"workers-1", "workers-2"};
  List<JobWorkerHealth> healthReports = new ArrayList<>();
  for (int id = 0; id < expectedHosts.length; id++) {
    healthReports.add(new JobWorkerHealth(id, new ArrayList<>(), 10, 0, 0, expectedHosts[id]));
  }

  // Route the static factory to a mocked client that serves the prepared health list.
  PowerMockito.mockStatic(JobMasterClient.Factory.class);
  JobMasterClient jobMasterClient = mock(JobMasterClient.class);
  when(jobMasterClient.getAllWorkerHealth()).thenReturn(healthReports);
  when(JobMasterClient.Factory.create(any())).thenReturn(jobMasterClient);

  List<LogLevel.TargetInfo> resolved = LogLevel.parseOptTarget(commandLine, mConf);

  assertEquals(expectedHosts.length, resolved.size());
  for (int i = 0; i < expectedHosts.length; i++) {
    LogLevel.TargetInfo expected =
        new LogLevel.TargetInfo(expectedHosts[i], JOB_WORKER_WEB_PORT, "job_worker");
    assertEquals(expected, resolved.get(i));
  }
}
use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.
the class JobServiceMetricsCommandTest method testBasic.
/**
 * Runs {@link JobServiceMetricsCommand} against a mocked job master client and checks the
 * three report sections it prints: worker health, job counts per status, and the job lists.
 *
 * <p>The command left-justifies its columns with fixed-width format specifiers (for example
 * {@code %-10s} for the worker host and the status name, {@code %-7s} for the numeric
 * cells), so the raw output contains runs of padding spaces. Each captured line is
 * therefore normalized to single spaces before it is compared, which keeps the assertions
 * about content and ordering rather than about column widths.
 */
@Test
public void testBasic() throws IOException, ParseException {
  JobWorkerHealth jobWorkerHealth =
      new JobWorkerHealth(1, Lists.newArrayList(1.2, 0.9, 0.7), 10, 2, 2, "testHost");
  Mockito.when(mJobMasterClient.getAllWorkerHealth())
      .thenReturn(Lists.newArrayList(jobWorkerHealth));

  List<JobInfo> jobInfos = new ArrayList<>();
  jobInfos.add(createJobInfo(1, "Test1", Status.RUNNING, "2019-10-17 12:00:00"));
  jobInfos.add(createJobInfo(2, "Test2", Status.FAILED, "2019-10-17 12:30:15"));
  Mockito.when(mJobMasterClient.getJobServiceSummary())
      .thenReturn(new JobServiceSummary(jobInfos));

  new JobServiceMetricsCommand(mJobMasterClient, mPrintStream, "MM-dd-yyyy HH:mm:ss:SSS").run();

  String output = new String(mOutputStream.toByteArray(), StandardCharsets.UTF_8);
  String[] lineByLine = output.split("\n");
  // Collapse the column padding so the expectations below do not depend on field widths.
  for (int i = 0; i < lineByLine.length; i++) {
    lineByLine[i] = lineByLine[i].replaceAll(" +", " ");
  }

  // Worker Health Section
  assertEquals("Worker: testHost Task Pool Size: 10 Unfinished Tasks: 2"
      + " Active Tasks: 2 Load Avg: 1.2, 0.9, 0.7", lineByLine[0]);
  assertEquals("", lineByLine[1]);

  // Group By Status
  lineByLine = ArrayUtils.subarray(lineByLine, 2, lineByLine.length);
  assertEquals("Status: CREATED Count: 0", lineByLine[0]);
  assertEquals("Status: CANCELED Count: 0", lineByLine[1]);
  assertEquals("Status: FAILED Count: 1", lineByLine[2]);
  assertEquals("Status: RUNNING Count: 1", lineByLine[3]);
  assertEquals("Status: COMPLETED Count: 0", lineByLine[4]);
  assertEquals("", lineByLine[5]);

  // Top 10
  // NOTE(review): the jobs are created with month 10 but the expected timestamps show
  // month 01; this presumably comes from how createJobInfo parses the date -- confirm.
  lineByLine = ArrayUtils.subarray(lineByLine, 6, lineByLine.length);
  assertEquals("10 Most Recently Modified Jobs:", lineByLine[0]);
  assertEquals("Timestamp: 01-17-2019 12:30:15:000 Id: 2 Name: Test2"
      + " Status: FAILED", lineByLine[1]);
  assertEquals("Timestamp: 01-17-2019 12:00:00:000 Id: 1 Name: Test1"
      + " Status: RUNNING", lineByLine[2]);
  assertEquals("", lineByLine[3]);
  assertEquals("10 Most Recently Failed Jobs:", lineByLine[4]);
  assertEquals("Timestamp: 01-17-2019 12:30:15:000 Id: 2 Name: Test2"
      + " Status: FAILED", lineByLine[5]);
  assertEquals("", lineByLine[6]);
  assertEquals("10 Longest Running Jobs:", lineByLine[7]);
  assertEquals("Timestamp: 01-17-2019 12:00:00:000 Id: 1 Name: Test1"
      + " Status: RUNNING", lineByLine[8]);
}
use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.
the class LogLevel method getTargetInfos.
/**
 * Resolves the user-supplied target strings into concrete {@link TargetInfo} endpoints.
 *
 * <p>Each target is trimmed and duplicates are dropped. A target is then interpreted as:
 * <ul>
 *   <li>the master role (the plural form is accepted and rewritten to the singular, with
 *       a notice printed to stdout) -- resolves to the master address of the file system
 *       context and the master web port;</li>
 *   <li>the job master role (plural likewise rewritten) -- resolves to the address of the
 *       job master client and the job master web port;</li>
 *   <li>the workers role -- one entry per cached block worker;</li>
 *   <li>the job workers role -- one entry per job worker health report, all using the
 *       configured job worker web port;</li>
 *   <li>{@code <host>:<port>} -- the role is inferred from the port.</li>
 * </ul>
 * Empty targets are skipped. If the workers or job workers role yields no hosts, a message
 * is printed and the JVM exits with status 1.
 *
 * @param targets the raw target strings
 * @param conf the Alluxio configuration used to create clients and look up ports
 * @return the resolved targets; targets are de-duplicated through a set, so the order is
 *         not guaranteed to follow the order of {@code targets}
 * @throws IOException if a target is neither a known role nor a well-formed
 *         {@code <host>:<port>} pair (including a missing or non-numeric port)
 */
private static List<TargetInfo> getTargetInfos(String[] targets, AlluxioConfiguration conf)
    throws IOException {
  // Trim the elements
  Set<String> targetSet = Arrays.stream(targets).map(String::trim).collect(Collectors.toSet());
  List<TargetInfo> targetInfoList = new ArrayList<>();

  // Allow plural form for the master/job_master and print a notice
  if (targetSet.contains(ROLE_MASTERS)) {
    System.out.println("The logLevel command will only take effect on the primary master, "
        + "instead of on all the masters. ");
    targetSet.remove(ROLE_MASTERS);
    targetSet.add(ROLE_MASTER);
    System.out.println("Target `masters` is replaced with `master`.");
  }
  if (targetSet.contains(ROLE_JOB_MASTERS)) {
    System.out.println("The logLevel command will only take effect on the primary job master, "
        + "instead of on all the masters. ");
    targetSet.remove(ROLE_JOB_MASTERS);
    targetSet.add(ROLE_JOB_MASTER);
    System.out.println("Target `job_masters` is replaced with `job_master`.");
  }

  ClientContext clientContext = ClientContext.create(conf);
  // NOTE(review): neither of the lazily created clients below is closed in this method;
  // confirm whether they should be released once the targets have been resolved.
  // Created only when needed by master and workers
  FileSystemContext fsContext = null;
  // Created only when needed by the job master and job workers
  JobMasterClient jobClient = null;

  // Process each target
  for (String target : targetSet) {
    if (target.isEmpty()) {
      continue;
    } else if (target.equals(ROLE_MASTER)) {
      if (fsContext == null) {
        fsContext = FileSystemContext.create(clientContext);
      }
      String masterHost = fsContext.getMasterAddress().getHostName();
      int masterPort = NetworkAddressUtils.getPort(ServiceType.MASTER_WEB, conf);
      targetInfoList.add(new TargetInfo(masterHost, masterPort, ROLE_MASTER));
    } else if (target.equals(ROLE_JOB_MASTER)) {
      if (jobClient == null) {
        jobClient = JobMasterClient.Factory.create(
            JobMasterClientContext.newBuilder(clientContext).build());
      }
      String jobMasterHost = jobClient.getAddress().getHostName();
      int jobMasterPort = NetworkAddressUtils.getPort(ServiceType.JOB_MASTER_WEB, conf);
      targetInfoList.add(new TargetInfo(jobMasterHost, jobMasterPort, ROLE_JOB_MASTER));
    } else if (target.equals(ROLE_WORKERS)) {
      if (fsContext == null) {
        // Reuse the shared client context, exactly as the master branch does.
        fsContext = FileSystemContext.create(clientContext);
      }
      List<BlockWorkerInfo> workerInfoList = fsContext.getCachedWorkers();
      if (workerInfoList.isEmpty()) {
        System.out.println("No workers found");
        System.exit(1);
      }
      for (BlockWorkerInfo workerInfo : workerInfoList) {
        WorkerNetAddress netAddress = workerInfo.getNetAddress();
        targetInfoList.add(
            new TargetInfo(netAddress.getHost(), netAddress.getWebPort(), ROLE_WORKER));
      }
    } else if (target.equals(ROLE_JOB_WORKERS)) {
      if (jobClient == null) {
        jobClient = JobMasterClient.Factory.create(
            JobMasterClientContext.newBuilder(clientContext).build());
      }
      List<JobWorkerHealth> jobWorkerInfoList = jobClient.getAllWorkerHealth();
      if (jobWorkerInfoList.isEmpty()) {
        System.out.println("No job workers found");
        System.exit(1);
      }
      int jobWorkerPort = conf.getInt(PropertyKey.JOB_WORKER_WEB_PORT);
      for (JobWorkerHealth jobWorkerInfo : jobWorkerInfoList) {
        targetInfoList.add(
            new TargetInfo(jobWorkerInfo.getHostname(), jobWorkerPort, ROLE_JOB_WORKER));
      }
    } else if (target.contains(":")) {
      String[] hostPortPair = target.split(":");
      // split() drops trailing empty strings, so "host:" or ":" yields fewer than two parts.
      if (hostPortPair.length < 2) {
        throw unrecognizedTargetException(target, null);
      }
      int port;
      try {
        port = Integer.parseInt(hostPortPair[1]);
      } catch (NumberFormatException e) {
        // Report a bad port the same way as any other unparsable target.
        throw unrecognizedTargetException(target, e);
      }
      String role = inferRoleFromPort(port, conf);
      LOG.debug("Port {} maps to role {}", port, role);
      TargetInfo unspecifiedTarget = new TargetInfo(hostPortPair[0], port, role);
      System.out.format("Role inferred from port: %s%n", unspecifiedTarget);
      targetInfoList.add(unspecifiedTarget);
    } else {
      throw unrecognizedTargetException(target, null);
    }
  }
  return targetInfoList;
}

/**
 * Builds the exception reported for a target that is neither a known role nor a
 * well-formed {@code <host>:<port>} pair.
 *
 * @param target the offending target string
 * @param cause the underlying parse failure, or null if there is none
 * @return the exception to throw
 */
private static IOException unrecognizedTargetException(String target, Throwable cause) {
  return new IOException(String.format("Unrecognized target argument: %s. "
      + "Please pass the targets in the form of <host>:<port>, "
      + "with comma as the separator.", target), cause);
}
use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.
the class CommandHandlingExecutor method heartbeat.
/**
 * Performs one heartbeat: refreshes the health metrics, throttles or unthrottles the task
 * executor accordingly, sends the worker health and the pending task updates to the job
 * master, and submits every command the master returns for execution.
 *
 * <p>If the heartbeat call fails, the pending task updates are handed back to the task
 * executor manager, the failure is logged, and no commands are executed.
 */
@Override
public void heartbeat() {
  mHealthReporter.compute();
  if (!mHealthReporter.isHealthy()) {
    mTaskExecutorManager.throttle();
  } else {
    mTaskExecutorManager.unthrottle();
  }

  JobWorkerHealth health = new JobWorkerHealth(
      JobWorkerIdRegistry.getWorkerId(),
      mHealthReporter.getCpuLoadAverage(),
      mTaskExecutorManager.getTaskExecutorPoolSize(),
      mTaskExecutorManager.getNumActiveTasks(),
      mTaskExecutorManager.unfinishedTasks(),
      mWorkerNetAddress.getHost());

  // Take the pending updates out of the manager and convert them to their proto form.
  List<TaskInfo> pendingUpdates = mTaskExecutorManager.getAndClearTaskUpdates();
  List<JobInfo> pendingProtos =
      pendingUpdates.stream().map(TaskInfo::toProto).collect(Collectors.toList());

  List<alluxio.grpc.JobCommand> masterCommands;
  try {
    masterCommands = mMasterClient.heartbeat(health, pendingProtos);
  } catch (AlluxioException | IOException e) {
    // Restore the task updates so that they can be accessed in the next heartbeat.
    mTaskExecutorManager.restoreTaskUpdates(pendingUpdates);
    // TODO(yupeng) better error handling
    LOG.error("Failed to heartbeat", e);
    return;
  }

  masterCommands.forEach(
      command -> mCommandHandlingService.execute(new CommandHandler(command)));
}
use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.
the class JobServiceMetricsCommand method run.
/**
 * Prints the job service metrics report to the configured print stream.
 *
 * <p>The report consists of, in order: one line per job worker (host, task pool size,
 * unfinished tasks, active tasks, load average), a blank line, one line per job status
 * with its count, a blank line, and then the most recently modified, most recently
 * failed and longest running job lists, each under its own heading.
 *
 * @return 0 once the report has been printed; this method has no other return value
 * @throws IOException declared for the job master client calls made while building the
 *         report
 */
public int run() throws IOException {
  // Worker health: padded cells first, the load average closes the line.
  for (JobWorkerHealth health : mJobMasterClient.getAllWorkerHealth()) {
    mPrintStream.printf("Worker: %-10s ", health.getHostname());
    mPrintStream.printf("Task Pool Size: %-7s", health.getTaskPoolSize());
    mPrintStream.printf("Unfinished Tasks: %-7s", health.getUnfinishedTasks());
    mPrintStream.printf("Active Tasks: %-7s", health.getNumActiveTasks());
    String loadAverages = StringUtils.join(health.getLoadAverage(), ", ");
    mPrintStream.println(String.format("Load Avg: %s", loadAverages));
  }
  mPrintStream.println();

  // Job counts grouped by status.
  JobServiceSummary summary = mJobMasterClient.getJobServiceSummary();
  Collection<StatusSummary> perStatus = summary.getSummaryPerStatus();
  for (StatusSummary entry : perStatus) {
    mPrintStream.printf("Status: %-10s", entry.getStatus());
    mPrintStream.println(String.format("Count: %s", entry.getCount()));
  }
  mPrintStream.println();

  // The three job lists, each preceded by its heading.
  mPrintStream.println(
      String.format("%s Most Recently Modified Jobs:", JobServiceSummary.RECENT_LENGTH));
  List<JobInfo> recentlyModified = summary.getRecentActivities();
  printJobInfos(recentlyModified);

  mPrintStream.println(
      String.format("%s Most Recently Failed Jobs:", JobServiceSummary.RECENT_LENGTH));
  List<JobInfo> recentlyFailed = summary.getRecentFailures();
  printJobInfos(recentlyFailed);

  mPrintStream.println(
      String.format("%s Longest Running Jobs:", JobServiceSummary.RECENT_LENGTH));
  List<JobInfo> longestRunning = summary.getLongestRunning();
  printJobInfos(longestRunning);

  return 0;
}
Aggregations