Search in sources:

Example 1 with JobWorkerHealth

use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.

Method parseJobWorkerTargets of the class LogLevelTest.

/**
 * Checks that the {@code job_workers} target is expanded into one
 * {@link LogLevel.TargetInfo} per job worker reported by the (mocked) job master client.
 */
@Test
public void parseJobWorkerTargets() throws Exception {
    // Two job workers that the mocked job master client will report
    List<JobWorkerHealth> reportedWorkers = new ArrayList<>();
    reportedWorkers.add(new JobWorkerHealth(0, new ArrayList<>(), 10, 0, 0, "workers-1"));
    reportedWorkers.add(new JobWorkerHealth(1, new ArrayList<>(), 10, 0, 0, "workers-2"));

    // Route the static client factory to a mock that returns those workers
    JobMasterClient jobMasterClient = mock(JobMasterClient.class);
    when(jobMasterClient.getAllWorkerHealth()).thenReturn(reportedWorkers);
    PowerMockito.mockStatic(JobMasterClient.Factory.class);
    when(JobMasterClient.Factory.create(any())).thenReturn(jobMasterClient);

    // Command line equivalent to: --target job_workers
    String[] cliArgs = new String[] { "--target", "job_workers" };
    CommandLine cmd = mock(CommandLine.class);
    when(cmd.getArgs()).thenReturn(cliArgs);
    when(cmd.hasOption(LogLevel.TARGET_OPTION_NAME)).thenReturn(true);
    when(cmd.getOptionValue(LogLevel.TARGET_OPTION_NAME)).thenReturn(cliArgs[1]);

    List<LogLevel.TargetInfo> parsed = LogLevel.parseOptTarget(cmd, mConf);

    assertEquals(2, parsed.size());
    LogLevel.TargetInfo expectedFirst =
        new LogLevel.TargetInfo("workers-1", JOB_WORKER_WEB_PORT, "job_worker");
    LogLevel.TargetInfo expectedSecond =
        new LogLevel.TargetInfo("workers-2", JOB_WORKER_WEB_PORT, "job_worker");
    assertEquals(expectedFirst, parsed.get(0));
    assertEquals(expectedSecond, parsed.get(1));
}
Also used : JobMasterClient(alluxio.client.job.JobMasterClient) CommandLine(org.apache.commons.cli.CommandLine) ArrayList(java.util.ArrayList) JobWorkerHealth(alluxio.job.wire.JobWorkerHealth) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 2 with JobWorkerHealth

use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.

Method testBasic of the class JobServiceMetricsCommandTest.

/**
 * Runs {@link JobServiceMetricsCommand} against a mocked job master client that reports one
 * worker and two jobs, then checks the printed report line by line.
 */
@Test
public void testBasic() throws IOException, ParseException {
    JobWorkerHealth workerHealth = new JobWorkerHealth(1, Lists.newArrayList(1.2, 0.9, 0.7), 10, 2, 2, "testHost");
    Mockito.when(mJobMasterClient.getAllWorkerHealth()).thenReturn(Lists.newArrayList(workerHealth));

    List<JobInfo> jobs = new ArrayList<>();
    jobs.add(createJobInfo(1, "Test1", Status.RUNNING, "2019-10-17 12:00:00"));
    jobs.add(createJobInfo(2, "Test2", Status.FAILED, "2019-10-17 12:30:15"));
    Mockito.when(mJobMasterClient.getJobServiceSummary()).thenReturn(new JobServiceSummary(jobs));

    new JobServiceMetricsCommand(mJobMasterClient, mPrintStream, "MM-dd-yyyy HH:mm:ss:SSS").run();
    String report = new String(mOutputStream.toByteArray(), StandardCharsets.UTF_8);
    String[] allLines = report.split("\n");

    // Worker health section: one line per worker followed by a blank line
    assertEquals("Worker: testHost    Task Pool Size: 10     Unfinished Tasks: 2" + "      Active Tasks: 2      Load Avg: 1.2, 0.9, 0.7", allLines[0]);
    assertEquals("", allLines[1]);

    // Per-status counts followed by a blank line
    String[] statusLines = ArrayUtils.subarray(allLines, 2, allLines.length);
    String[] expectedStatusLines = new String[] {
        "Status: CREATED   Count: 0",
        "Status: CANCELED  Count: 0",
        "Status: FAILED    Count: 1",
        "Status: RUNNING   Count: 1",
        "Status: COMPLETED Count: 0",
        ""
    };
    for (int i = 0; i < expectedStatusLines.length; i++) {
        assertEquals(expectedStatusLines[i], statusLines[i]);
    }

    // NOTE(review): the jobs above are created with month 10 but the expected output shows
    // month 01 - presumably an artifact of how createJobInfo parses the date; confirm.
    String failedJobLine = "Timestamp: 01-17-2019 12:30:15:000       Id: 2                   Name: Test2" + "               Status: FAILED";
    String runningJobLine = "Timestamp: 01-17-2019 12:00:00:000       Id: 1                   Name: Test1" + "               Status: RUNNING";

    // The three job listings
    String[] jobLines = ArrayUtils.subarray(statusLines, 6, statusLines.length);
    assertEquals("10 Most Recently Modified Jobs:", jobLines[0]);
    assertEquals(failedJobLine, jobLines[1]);
    assertEquals(runningJobLine, jobLines[2]);
    assertEquals("", jobLines[3]);
    assertEquals("10 Most Recently Failed Jobs:", jobLines[4]);
    assertEquals(failedJobLine, jobLines[5]);
    assertEquals("", jobLines[6]);
    assertEquals("10 Longest Running Jobs:", jobLines[7]);
    assertEquals(runningJobLine, jobLines[8]);
}
Also used : JobInfo(alluxio.job.wire.JobInfo) ArrayList(java.util.ArrayList) JobWorkerHealth(alluxio.job.wire.JobWorkerHealth) JobServiceSummary(alluxio.job.wire.JobServiceSummary) Test(org.junit.Test)

Example 3 with JobWorkerHealth

use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.

Method getTargetInfos of the class LogLevel.

/**
 * Resolves the user-supplied target strings into concrete {@link TargetInfo} entries.
 *
 * <p>Targets are trimmed and de-duplicated first. The plural forms {@code ROLE_MASTERS} and
 * {@code ROLE_JOB_MASTERS} are replaced with their singular counterparts (with a notice printed
 * to stdout). Role targets are expanded by querying the cluster; any other target must have
 * the form {@code <host>:<port>}, in which case the role is inferred from the port.
 *
 * <p>If the workers or job workers role is requested and none are found, a message is printed
 * and the JVM exits with status 1.
 *
 * @param targets the raw target strings; empty entries are ignored
 * @param conf the configuration used to create clients and to look up web ports
 * @return the resolved targets, in the iteration order of the de-duplicated target set
 * @throws IOException if a target is not recognized, a {@code <host>:<port>} target is
 *         malformed, or a client call or close fails
 */
private static List<TargetInfo> getTargetInfos(String[] targets, AlluxioConfiguration conf) throws IOException {
    // Trim the elements
    Set<String> targetSet = Arrays.stream(targets).map(String::trim).collect(Collectors.toSet());
    List<TargetInfo> targetInfoList = new ArrayList<>();
    // Allow plural form for the master/job_master and print a notice
    if (targetSet.contains(ROLE_MASTERS)) {
        System.out.println("The logLevel command will only take effect on the primary master, " + "instead of on all the masters. ");
        targetSet.remove(ROLE_MASTERS);
        targetSet.add(ROLE_MASTER);
        System.out.println("Target `masters` is replaced with `master`.");
    }
    if (targetSet.contains(ROLE_JOB_MASTERS)) {
        System.out.println("The logLevel command will only take effect on the primary job master, " + "instead of on all the masters. ");
        targetSet.remove(ROLE_JOB_MASTERS);
        targetSet.add(ROLE_JOB_MASTER);
        System.out.println("Target `job_masters` is replaced with `job_master`.");
    }
    ClientContext clientContext = ClientContext.create(conf);
    // Created only when needed by master and workers
    FileSystemContext fsContext = null;
    // Created only when needed by the job master and job workers
    JobMasterClient jobClient = null;
    try {
        // Process each target
        for (String target : targetSet) {
            if (target.isEmpty()) {
                continue;
            } else if (target.equals(ROLE_MASTER)) {
                if (fsContext == null) {
                    fsContext = FileSystemContext.create(clientContext);
                }
                String masterHost = fsContext.getMasterAddress().getHostName();
                int masterPort = NetworkAddressUtils.getPort(ServiceType.MASTER_WEB, conf);
                targetInfoList.add(new TargetInfo(masterHost, masterPort, ROLE_MASTER));
            } else if (target.equals(ROLE_JOB_MASTER)) {
                if (jobClient == null) {
                    jobClient = JobMasterClient.Factory.create(JobMasterClientContext.newBuilder(clientContext).build());
                }
                String jobMasterHost = jobClient.getAddress().getHostName();
                int jobMasterPort = NetworkAddressUtils.getPort(ServiceType.JOB_MASTER_WEB, conf);
                targetInfoList.add(new TargetInfo(jobMasterHost, jobMasterPort, ROLE_JOB_MASTER));
            } else if (target.equals(ROLE_WORKERS)) {
                if (fsContext == null) {
                    // Reuse the shared client context, same as the master branch
                    fsContext = FileSystemContext.create(clientContext);
                }
                List<BlockWorkerInfo> workerInfoList = fsContext.getCachedWorkers();
                if (workerInfoList.isEmpty()) {
                    System.out.println("No workers found");
                    System.exit(1);
                }
                for (BlockWorkerInfo workerInfo : workerInfoList) {
                    WorkerNetAddress netAddress = workerInfo.getNetAddress();
                    targetInfoList.add(new TargetInfo(netAddress.getHost(), netAddress.getWebPort(), ROLE_WORKER));
                }
            } else if (target.equals(ROLE_JOB_WORKERS)) {
                if (jobClient == null) {
                    jobClient = JobMasterClient.Factory.create(JobMasterClientContext.newBuilder(clientContext).build());
                }
                List<JobWorkerHealth> jobWorkerInfoList = jobClient.getAllWorkerHealth();
                if (jobWorkerInfoList.isEmpty()) {
                    System.out.println("No job workers found");
                    System.exit(1);
                }
                // Every job worker is addressed on the single configured job worker web port
                int jobWorkerPort = conf.getInt(PropertyKey.JOB_WORKER_WEB_PORT);
                for (JobWorkerHealth jobWorkerInfo : jobWorkerInfoList) {
                    targetInfoList.add(new TargetInfo(jobWorkerInfo.getHostname(), jobWorkerPort, ROLE_JOB_WORKER));
                }
            } else if (target.contains(":")) {
                // String.split drops trailing empty strings, so "host:" yields a single element
                String[] hostPortPair = target.split(":");
                if (hostPortPair.length < 2 || hostPortPair[0].isEmpty()) {
                    throw new IOException(String.format("Malformed target argument: %s. " + "Please pass the targets in the form of <host>:<port>, " + "with comma as the separator.", target));
                }
                int port;
                try {
                    port = Integer.parseInt(hostPortPair[1]);
                } catch (NumberFormatException e) {
                    throw new IOException(String.format("Invalid port in target argument: %s. " + "Please pass the targets in the form of <host>:<port>, " + "with comma as the separator.", target), e);
                }
                String role = inferRoleFromPort(port, conf);
                LOG.debug("Port {} maps to role {}", port, role);
                TargetInfo unspecifiedTarget = new TargetInfo(hostPortPair[0], port, role);
                System.out.format("Role inferred from port: %s%n", unspecifiedTarget);
                targetInfoList.add(unspecifiedTarget);
            } else {
                throw new IOException(String.format("Unrecognized target argument: %s. " + "Please pass the targets in the form of <host>:<port>, " + "with comma as the separator.", target));
            }
        }
    } finally {
        // The clients are only needed to look up addresses; release them once resolution is done.
        // Nested so that the file system context is closed even if closing the job client fails.
        try {
            if (jobClient != null) {
                jobClient.close();
            }
        } finally {
            if (fsContext != null) {
                fsContext.close();
            }
        }
    }
    return targetInfoList;
}
Also used : JobMasterClient(alluxio.client.job.JobMasterClient) ClientContext(alluxio.ClientContext) JobMasterClientContext(alluxio.worker.job.JobMasterClientContext) ArrayList(java.util.ArrayList) IOException(java.io.IOException) WorkerNetAddress(alluxio.wire.WorkerNetAddress) BlockWorkerInfo(alluxio.client.block.BlockWorkerInfo) FileSystemContext(alluxio.client.file.FileSystemContext) JobWorkerHealth(alluxio.job.wire.JobWorkerHealth) ArrayList(java.util.ArrayList) List(java.util.List)

Example 4 with JobWorkerHealth

use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.

Method heartbeat of the class CommandHandlingExecutor.

/**
 * Performs one heartbeat: recomputes worker health, throttles or unthrottles the task executor
 * accordingly, reports health and pending task updates to the master client, and schedules
 * every returned command for execution.
 *
 * <p>If the heartbeat call fails, the task updates are restored so they can be accessed in the
 * next heartbeat, the error is logged, and no commands are executed.
 */
@Override
public void heartbeat() {
    mHealthReporter.compute();
    if (!mHealthReporter.isHealthy()) {
        mTaskExecutorManager.throttle();
    } else {
        mTaskExecutorManager.unthrottle();
    }

    JobWorkerHealth health = new JobWorkerHealth(
        JobWorkerIdRegistry.getWorkerId(),
        mHealthReporter.getCpuLoadAverage(),
        mTaskExecutorManager.getTaskExecutorPoolSize(),
        mTaskExecutorManager.getNumActiveTasks(),
        mTaskExecutorManager.unfinishedTasks(),
        mWorkerNetAddress.getHost());

    // Drain the pending updates; they are put back below if the heartbeat fails
    List<TaskInfo> pendingUpdates = mTaskExecutorManager.getAndClearTaskUpdates();
    List<JobInfo> protoUpdates = pendingUpdates.stream()
        .map(TaskInfo::toProto)
        .collect(Collectors.toList());

    List<alluxio.grpc.JobCommand> received;
    try {
        received = mMasterClient.heartbeat(health, protoUpdates);
    } catch (AlluxioException | IOException e) {
        // Restore the task updates so that they can be accessed in the next heartbeat.
        mTaskExecutorManager.restoreTaskUpdates(pendingUpdates);
        // TODO(yupeng) better error handling
        LOG.error("Failed to heartbeat", e);
        return;
    }

    for (JobCommand cmd : received) {
        mCommandHandlingService.execute(new CommandHandler(cmd));
    }
}
Also used : TaskInfo(alluxio.job.wire.TaskInfo) JobInfo(alluxio.grpc.JobInfo) JobCommand(alluxio.grpc.JobCommand) JobWorkerHealth(alluxio.job.wire.JobWorkerHealth) IOException(java.io.IOException) AlluxioException(alluxio.exception.AlluxioException)

Example 5 with JobWorkerHealth

use of alluxio.job.wire.JobWorkerHealth in project alluxio by Alluxio.

Method run of the class JobServiceMetricsCommand.

/**
 * Prints the job service metrics report to the configured print stream.
 *
 * <p>The report consists of one health line per job worker, the job count per status, and
 * three job listings (most recently modified, most recently failed, longest running), all
 * obtained from the job master client.
 *
 * @return 0; no code path in this method returns any other value
 * @throws IOException declared by this method; presumably propagated from the job master
 *         client calls
 */
public int run() throws IOException {
    // Section 1: per-worker health, one line each
    List<JobWorkerHealth> workers = mJobMasterClient.getAllWorkerHealth();
    for (JobWorkerHealth worker : workers) {
        String hostColumn = String.format("Worker: %-10s  ", worker.getHostname());
        mPrintStream.print(hostColumn);
        String poolColumn = String.format("Task Pool Size: %-7s", worker.getTaskPoolSize());
        mPrintStream.print(poolColumn);
        String unfinishedColumn = String.format("Unfinished Tasks: %-7s", worker.getUnfinishedTasks());
        mPrintStream.print(unfinishedColumn);
        String activeColumn = String.format("Active Tasks: %-7s", worker.getNumActiveTasks());
        mPrintStream.print(activeColumn);
        String loadColumn = String.format("Load Avg: %s", StringUtils.join(worker.getLoadAverage(), ", "));
        mPrintStream.println(loadColumn);
    }
    mPrintStream.println();

    // Section 2: number of jobs per status
    JobServiceSummary summary = mJobMasterClient.getJobServiceSummary();
    Collection<StatusSummary> perStatus = summary.getSummaryPerStatus();
    for (StatusSummary entry : perStatus) {
        mPrintStream.print(String.format("Status: %-10s", entry.getStatus()));
        mPrintStream.println(String.format("Count: %s", entry.getCount()));
    }
    mPrintStream.println();

    // Section 3: the three job listings
    mPrintStream.println(String.format("%s Most Recently Modified Jobs:", JobServiceSummary.RECENT_LENGTH));
    printJobInfos(summary.getRecentActivities());

    mPrintStream.println(String.format("%s Most Recently Failed Jobs:", JobServiceSummary.RECENT_LENGTH));
    printJobInfos(summary.getRecentFailures());

    mPrintStream.println(String.format("%s Longest Running Jobs:", JobServiceSummary.RECENT_LENGTH));
    printJobInfos(summary.getLongestRunning());

    return 0;
}
Also used : StatusSummary(alluxio.job.wire.StatusSummary) JobInfo(alluxio.job.wire.JobInfo) JobWorkerHealth(alluxio.job.wire.JobWorkerHealth) JobServiceSummary(alluxio.job.wire.JobServiceSummary)

Aggregations

JobWorkerHealth (alluxio.job.wire.JobWorkerHealth)6 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 JobMasterClient (alluxio.client.job.JobMasterClient)2 JobInfo (alluxio.job.wire.JobInfo)2 JobServiceSummary (alluxio.job.wire.JobServiceSummary)2 IOException (java.io.IOException)2 List (java.util.List)2 ClientContext (alluxio.ClientContext)1 BlockWorkerInfo (alluxio.client.block.BlockWorkerInfo)1 FileSystemContext (alluxio.client.file.FileSystemContext)1 AlluxioException (alluxio.exception.AlluxioException)1 JobCommand (alluxio.grpc.JobCommand)1 JobInfo (alluxio.grpc.JobInfo)1 StatusSummary (alluxio.job.wire.StatusSummary)1 TaskInfo (alluxio.job.wire.TaskInfo)1 BaseIntegrationTest (alluxio.testutils.BaseIntegrationTest)1 WorkerNetAddress (alluxio.wire.WorkerNetAddress)1 JobMasterClientContext (alluxio.worker.job.JobMasterClientContext)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1