Search in sources :

Example 6 with PlanCoordinator

use of alluxio.master.job.plan.PlanCoordinator in project alluxio by Alluxio.

In class JobMaster, method start.

/**
 * Starts the job master.
 *
 * After delegating to the superclass, every plan that has not finished is marked as failed.
 * When started as leader, the lost-worker detection heartbeat is submitted and, if audit
 * logging is enabled in the configuration, the asynchronous audit log writer is started and a
 * gauge reporting its entry count (or -1 when no writer is set) is registered.
 *
 * @param isLeader whether this master is being started as the leader
 * @throws IOException if startup fails with an I/O error
 */
@Override
public void start(Boolean isLeader) throws IOException {
    super.start(isLeader);
    // Plans still unfinished at this point were running when the previous job master stopped,
    // so they are marked as failed.
    for (PlanCoordinator coordinator : mPlanTracker.coordinators()) {
        if (coordinator.isJobFinished()) {
            continue;
        }
        coordinator.setJobAsFailed("JobMasterShutdown", "Job failed: Job master shut down during execution");
    }
    // Everything below applies to the leader only.
    if (!isLeader) {
        return;
    }
    getExecutorService().submit(
        new HeartbeatThread(
            HeartbeatContext.JOB_MASTER_LOST_WORKER_DETECTION,
            new LostWorkerDetectionHeartbeatExecutor(),
            (int) ServerConfiguration.getMs(PropertyKey.JOB_MASTER_LOST_WORKER_INTERVAL),
            ServerConfiguration.global(),
            mMasterContext.getUserState()));
    if (!ServerConfiguration.getBoolean(PropertyKey.MASTER_AUDIT_LOGGING_ENABLED)) {
        return;
    }
    mAsyncAuditLogWriter = new AsyncUserAccessAuditLogWriter("JOB_MASTER_AUDIT_LOG");
    mAsyncAuditLogWriter.start();
    MetricsSystem.registerGaugeIfAbsent(
        MetricKey.MASTER_AUDIT_LOG_ENTRIES_SIZE.getName(),
        () -> mAsyncAuditLogWriter != null ? mAsyncAuditLogWriter.getAuditLogEntriesSize() : -1);
}
Also used : HeartbeatThread(alluxio.heartbeat.HeartbeatThread) AsyncUserAccessAuditLogWriter(alluxio.master.audit.AsyncUserAccessAuditLogWriter) PlanCoordinator(alluxio.master.job.plan.PlanCoordinator)

Example 7 with PlanCoordinator

use of alluxio.master.job.plan.PlanCoordinator in project alluxio by Alluxio.

In class JobMaster, method listDetailed.

/**
 * Gathers the job infos reported by all plan coordinators and by the workflow tracker, sorted
 * by ascending job id. The call is recorded through an audit context named "listDetailed".
 *
 * @return list of all job infos
 */
public List<JobInfo> listDetailed() {
    try (JobMasterAuditContext audit = createAuditContext("listDetailed")) {
        List<JobInfo> allInfos = new ArrayList<>();
        // Plan infos are added first, then the workflow infos; the sort below fixes the order.
        mPlanTracker.coordinators()
            .forEach(planCoordinator -> allInfos.add(planCoordinator.getPlanInfoWire(false)));
        allInfos.addAll(mWorkflowTracker.getAllInfo());
        Comparator<JobInfo> byJobId = Comparator.comparingLong(JobInfo::getId);
        allInfos.sort(byJobId);
        audit.setSucceeded(true);
        return allInfos;
    }
}
Also used : JobInfo(alluxio.job.wire.JobInfo) ArrayList(java.util.ArrayList) PlanCoordinator(alluxio.master.job.plan.PlanCoordinator)

Example 8 with PlanCoordinator

use of alluxio.master.job.plan.PlanCoordinator in project alluxio by Alluxio.

In class JobMaster, method registerWorker.

/**
 * Registers a job worker with this master and assigns it a fresh worker id.
 *
 * If a worker is already recorded for the same address, that earlier registration is treated
 * as dead: its tasks are failed on every plan coordinator, and its health entry and worker
 * entry are removed before the new id is handed out.
 *
 * @param workerNetAddress the {@link WorkerNetAddress} of the registering worker
 * @return the worker id assigned to this worker
 */
public long registerWorker(WorkerNetAddress workerNetAddress) {
    // The write lock of mWorkerRWLock is held for the whole registration.
    try (LockResource writeLockResource = new LockResource(mWorkerRWLock.writeLock())) {
        boolean alreadyRegistered = mWorkers.contains(mAddressIndex, workerNetAddress);
        if (alreadyRegistered) {
            // A second registration from the same address means the previous worker process
            // died and was restarted, so the old entry has to be cleaned up first.
            LOG.info("Worker at address {} is re-registering. Failing tasks for previous worker at that " + "address", workerNetAddress);
            MasterWorkerInfo staleWorker = mWorkers.getFirstByField(mAddressIndex, workerNetAddress);
            mPlanTracker.coordinators()
                .forEach(coordinator -> coordinator.failTasksForWorker(staleWorker.getId()));
            mWorkerHealth.remove(staleWorker.getId());
            mWorkers.remove(staleWorker);
        }
        // Hand out the next id and record the worker under it.
        long assignedId = mNextWorkerId.getAndIncrement();
        MasterWorkerInfo registeredWorker = new MasterWorkerInfo(assignedId, workerNetAddress);
        mWorkers.add(registeredWorker);
        LOG.info("registerWorker(): WorkerNetAddress: {} id: {}", workerNetAddress, assignedId);
        return assignedId;
    }
}
Also used : LockResource(alluxio.resource.LockResource) MasterWorkerInfo(alluxio.job.MasterWorkerInfo) PlanCoordinator(alluxio.master.job.plan.PlanCoordinator)

Example 9 with PlanCoordinator

use of alluxio.master.job.plan.PlanCoordinator in project alluxio by Alluxio.

In class JobMaster, method workerHeartbeat.

/**
 * Updates the tasks' status when a worker periodically heartbeats with the master, and sends the
 * commands for the worker to execute.
 *
 * If the heartbeating worker id is not known to this master, a single register command is
 * returned and nothing else is updated.
 *
 * @param jobWorkerHealth the job worker health info
 * @param taskInfoList the list of the task information
 * @return the list of {@link JobCommand} to the worker
 * @throws ResourceExhaustedException declared by this method; NOTE(review): which callee
 *         raises it is not visible from this method alone
 */
public List<JobCommand> workerHeartbeat(JobWorkerHealth jobWorkerHealth, List<TaskInfo> taskInfoList) throws ResourceExhaustedException {
    long workerId = jobWorkerHealth.getWorkerId();
    String hostname;
    // Run under shared lock for mWorkers
    try (LockResource workersLockShared = new LockResource(mWorkerRWLock.readLock())) {
        MasterWorkerInfo worker = mWorkers.getFirstByField(mIdIndex, workerId);
        if (worker == null) {
            // Unknown worker id: ask the worker to register.
            return Collections.singletonList(JobCommand.newBuilder().setRegisterCommand(RegisterCommand.getDefaultInstance()).build());
        }
        hostname = worker.getWorkerAddress().getHost();
        // Update last-update-time of this particular worker under lock
        // to prevent lost worker detector clearing it under race
        worker.updateLastUpdatedTimeMs();
    }
    mWorkerHealth.put(workerId, jobWorkerHealth);
    // Group the reported task infos by job id, stamping each with the worker's host name.
    Map<Long, List<TaskInfo>> taskInfosPerJob = new HashMap<>();
    for (TaskInfo taskInfo : taskInfoList) {
        taskInfo.setWorkerHost(hostname);
        // computeIfAbsent creates the per-job list on first use with a properly typed
        // ArrayList and a single map lookup.
        taskInfosPerJob.computeIfAbsent(taskInfo.getJobId(), jobId -> new ArrayList<>()).add(taskInfo);
    }
    // Forward each group to its plan coordinator; jobs without a coordinator are skipped.
    for (Map.Entry<Long, List<TaskInfo>> taskInfosPair : taskInfosPerJob.entrySet()) {
        PlanCoordinator planCoordinator = mPlanTracker.getCoordinator(taskInfosPair.getKey());
        if (planCoordinator != null) {
            planCoordinator.updateTasks(taskInfosPair.getValue());
        }
    }
    return mCommandManager.pollAllPendingCommands(workerId);
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaskInfo(alluxio.job.wire.TaskInfo) LockResource(alluxio.resource.LockResource) MasterWorkerInfo(alluxio.job.MasterWorkerInfo) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Map(java.util.Map) PlanCoordinator(alluxio.master.job.plan.PlanCoordinator)

Aggregations

PlanCoordinator (alluxio.master.job.plan.PlanCoordinator)9 JobConfig (alluxio.job.JobConfig)4 JobServerContext (alluxio.job.JobServerContext)4 SleepJobConfig (alluxio.job.SleepJobConfig)4 CommandManager (alluxio.master.job.command.CommandManager)4 ArrayList (java.util.ArrayList)4 Consumer (java.util.function.Consumer)4 Test (org.junit.Test)4 PrepareForTest (org.powermock.core.classloader.annotations.PrepareForTest)4 TestPlanConfig (alluxio.job.TestPlanConfig)3 MasterWorkerInfo (alluxio.job.MasterWorkerInfo)2 LockResource (alluxio.resource.LockResource)2 ArgumentMatchers.anyLong (org.mockito.ArgumentMatchers.anyLong)2 JobDoesNotExistException (alluxio.exception.JobDoesNotExistException)1 ResourceExhaustedException (alluxio.exception.status.ResourceExhaustedException)1 HeartbeatThread (alluxio.heartbeat.HeartbeatThread)1 JobInfo (alluxio.job.wire.JobInfo)1 TaskInfo (alluxio.job.wire.TaskInfo)1 AsyncUserAccessAuditLogWriter (alluxio.master.audit.AsyncUserAccessAuditLogWriter)1 HashMap (java.util.HashMap)1