Search in sources :

Example 1 with JobFaultyException

use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.

the class ZKWorkerController method waitOnBarrier.

/**
 * Waits on the default barrier until a barrier result is published for this worker
 * or the local wait limit expires.
 * <p>
 * All workers create a znode on the barrier directory
 * Job master watches znode creations/removals on this directory
 * when the number of znodes on that directory reaches the number of workers in the job,
 * Job master publishes AllArrivedOnBarrier event
 * Workers proceed when they get this event or when they time out
 * <p>
 * Workers remove their znodes after they proceed through the barrier
 * so that they can wait on the barrier again
 * Workers are responsible for creating and removing znodes on the barrier
 * Job master removes barrier znode after the job completion or scale down.
 * <p>
 * The local wait limit is twice the requested time limit (capped at Long.MAX_VALUE).
 * If the barrier znode can not be created, the failure is logged and the method
 * returns without waiting.
 *
 * @param timeLimit the barrier time limit in milliseconds, passed on when creating the znode
 * @throws TimeoutException if the published barrier result is TIMED_OUT,
 * or if no result is received within the local wait limit
 * @throws JobFaultyException if the job is already faulty when this method is called,
 * or if the published barrier result is JOB_FAULTY
 */
@Override
public void waitOnBarrier(long timeLimit) throws TimeoutException {
    // do not wait on the barrier
    if (JobProgress.isJobFaulty()) {
        throw new JobFaultyException("Can not wait on the barrier, since the job is faulty.");
    }
    defaultBarrierProceeded = false;
    try {
        ZKBarrierManager.createWorkerZNodeAtDefault(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, e.getMessage(), e);
        return;
    }
    // wait until all workers joined or time limit is reached
    long startTime = System.currentTimeMillis();
    long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
    synchronized (defaultBarrierWaitObject) {
        long delay = 0;
        // Re-check the flag on every iteration:
        //  - if the result arrived before we got here, we must not wait (and must not spin)
        //  - a wake-up without the flag being set (spurious wake-up or interrupt)
        //    must not be mistaken for a timeout
        while (!defaultBarrierProceeded && delay < tl) {
            try {
                defaultBarrierWaitObject.wait(tl - delay);
            } catch (InterruptedException ignored) {
                // an interrupt does not abort the barrier wait;
                // fall through to recompute the remaining time and wait again
                // NOTE(review): the interrupt status is not restored here, because the
                // znode deletion below still has to run on this thread - confirm
            }
            // clamp to zero so that a backwards clock jump can not produce a negative delay
            delay = Math.max(0L, System.currentTimeMillis() - startTime);
        }
    }
    // delete barrier znode in any case
    try {
        ZKBarrierManager.deleteWorkerZNodeFromDefault(client, rootPath, jobID, workerInfo.getWorkerID());
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, e.getMessage(), e);
    }
    if (defaultBarrierProceeded) {
        if (defaultBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
            return;
        } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
            throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
        } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
            throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
        }
        // this should never happen, since we have only these three options
        return;
    } else {
        throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
    }
}
Also used : Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) JobFaultyException(edu.iu.dsc.tws.api.exceptions.JobFaultyException) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)

Example 2 with JobFaultyException

use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.

the class ZKWorkerController method waitOnInitBarrier.

/**
 * init barrier
 * the same algorithm as the default barrier
 * <p>
 * The time limit is read from ControllerContext.maxWaitTimeOnInitBarrier(config).
 * This worker waits locally for at most twice that limit (capped at Long.MAX_VALUE)
 * for a barrier result to be published.
 * If the barrier znode can not be created, the failure is logged and the method
 * returns without waiting.
 *
 * @throws TimeoutException if the published barrier result is TIMED_OUT,
 * or if no result is received within the local wait limit
 * @throws JobFaultyException if the published barrier result is JOB_FAULTY
 */
public void waitOnInitBarrier() throws TimeoutException {
    initBarrierProceeded = false;
    long timeLimit = ControllerContext.maxWaitTimeOnInitBarrier(config);
    try {
        ZKBarrierManager.createWorkerZNodeAtInit(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, e.getMessage(), e);
        return;
    }
    // wait until all workers joined or the time limit is reached
    long startTime = System.currentTimeMillis();
    long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
    synchronized (initBarrierWaitObject) {
        long delay = 0;
        // Re-check the flag on every iteration:
        //  - if the result arrived before we got here, we must not wait (and must not spin)
        //  - a wake-up without the flag being set (spurious wake-up or interrupt)
        //    must not be mistaken for a timeout
        while (!initBarrierProceeded && delay < tl) {
            try {
                initBarrierWaitObject.wait(tl - delay);
            } catch (InterruptedException ignored) {
                // an interrupt does not abort the barrier wait;
                // fall through to recompute the remaining time and wait again
                // NOTE(review): the interrupt status is not restored here, because the
                // znode deletion below still has to run on this thread - confirm
            }
            // clamp to zero so that a backwards clock jump can not produce a negative delay
            delay = Math.max(0L, System.currentTimeMillis() - startTime);
        }
    }
    // delete barrier znode in any case
    try {
        ZKBarrierManager.deleteWorkerZNodeFromInit(client, rootPath, jobID, workerInfo.getWorkerID());
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, e.getMessage(), e);
    }
    if (initBarrierProceeded) {
        if (initBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
            return;
        } else if (initBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
            throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
        } else if (initBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
            throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
        }
        // this should never happen, since we have only these three options
        return;
    } else {
        throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
    }
}
Also used : Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) JobFaultyException(edu.iu.dsc.tws.api.exceptions.JobFaultyException) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)

Example 3 with JobFaultyException

use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.

the class JMWorkerController method sendBarrierRequest.

/**
 * Builds a BarrierRequest for this worker, sends it through rrClient and blocks
 * until a response arrives or the local wait limit expires.
 *
 * @param barrierType the barrier type to put into the request
 * @param timeLimit the barrier time limit put into the request, reported as ms in messages
 * @throws TimeoutException if the response result is TIMED_OUT,
 * or if the blocking send fails with a BlockingSendException
 * @throws JobFaultyException if the response result is JOB_FAULTY
 */
private void sendBarrierRequest(JobMasterAPI.BarrierType barrierType, long timeLimit) throws TimeoutException {
    JobMasterAPI.BarrierRequest request = JobMasterAPI.BarrierRequest.newBuilder()
        .setWorkerID(workerInfo.getWorkerID())
        .setBarrierType(barrierType)
        .setTimeout(timeLimit)
        .build();
    LOG.fine("Sending BarrierRequest message: \n" + request.toString());

    try {
        // wait locally for twice the requested limit;
        // doubling would overflow above half of the long range, so cap at the long max value
        long localWaitLimit;
        if (timeLimit > Long.MAX_VALUE / 2) {
            localWaitLimit = Long.MAX_VALUE;
        } else {
            localWaitLimit = timeLimit * 2;
        }

        Tuple<RequestID, Message> reply = rrClient.sendRequestWaitResponse(request, localWaitLimit);
        JobMasterAPI.BarrierResponse barrierResponse = (JobMasterAPI.BarrierResponse) reply.getValue();
        JobMasterAPI.BarrierResult result = barrierResponse.getResult();

        if (result == JobMasterAPI.BarrierResult.JOB_FAULTY) {
            throw new JobFaultyException("Job became faulty and Default Barrier failed.");
        }
        if (result == JobMasterAPI.BarrierResult.TIMED_OUT) {
            throw new TimeoutException("Barrier timed out. Not all workers arrived at the barrier on the time limit: " + timeLimit + "ms");
        }
        // SUCCESS (or any other result): return normally
    } catch (BlockingSendException e) {
        throw new TimeoutException("Not all workers arrived at the barrier on the time limit: " + timeLimit + "ms.", e);
    }
}
Also used : JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) BlockingSendException(edu.iu.dsc.tws.api.exceptions.net.BlockingSendException) RequestID(edu.iu.dsc.tws.api.net.request.RequestID) Message(com.google.protobuf.Message) JobFaultyException(edu.iu.dsc.tws.api.exceptions.JobFaultyException) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)

Example 4 with JobFaultyException

use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.

the class MPIWorkerManager method execute.

/**
 * Waits on the init barrier and then runs the given IWorker once.
 * <p>
 * Returns true when the worker's execute method completes.
 * If the init barrier times out, a Twister2RuntimeException is thrown.
 * If the worker throws JobFaultyException, the job status is set to FAULTY
 * and the exception is rethrown to the caller.
 */
public boolean execute(Config config, JobAPI.Job job, IWorkerController workerController, IPersistentVolume persistentVolume, IVolatileVolume volatileVolume, IWorker managedWorker) {
    final int id = workerController.getWorkerInfo().getWorkerID();
    String waitingMessage = "Waiting on the init barrier before starting IWorker: " + id
        + " with restartCount: " + workerController.workerRestartCount()
        + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
    LOG.info(waitingMessage);

    try {
        workerController.waitOnInitBarrier();
        firstInitBarrierProceeded = true;
    } catch (TimeoutException timeout) {
        throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
    }

    // worker ports are released only on the very first execution
    boolean firstExecution = JobProgress.getWorkerExecuteCount() == 0;
    if (firstExecution) {
        NetworkUtils.releaseWorkerPorts();
    }

    JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
    JobProgressImpl.increaseWorkerExecuteCount();

    try {
        managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
    } catch (JobFaultyException faulty) {
        // a worker in the cluster should have failed; record it and let the caller decide
        JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
        throw faulty;
    }
    return true;
}
Also used : Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) JobFaultyException(edu.iu.dsc.tws.api.exceptions.JobFaultyException) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)

Example 5 with JobFaultyException

use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.

the class WorkerManager method execute.

/**
 * Runs the managed IWorker, re-executing it while the worker execute count
 * is below maxRetries.
 * <p>
 * Each attempt first waits on the init barrier, then executes the worker.
 * If the job is healthy after the execution, waits on the final barrier with
 * an unbounded time limit.
 *
 * @return true if the worker executed and passed the final barrier,
 * false if the execute count reached maxRetries without a successful run
 * @throws Twister2RuntimeException if the init barrier or the final barrier times out
 */
public boolean execute() {
    while (JobProgress.getWorkerExecuteCount() < maxRetries) {
        String waitingMessage = "Waiting on the init barrier before starting IWorker: " + workerID
            + " with restartCount: " + workerController.workerRestartCount()
            + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
        LOG.info(waitingMessage);

        try {
            workerController.waitOnInitBarrier();
            firstInitBarrierProceeded = true;
        } catch (TimeoutException timeout) {
            throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
        }
        LOG.fine("Proceeded through INIT barrier. Starting Worker: " + workerID);

        JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
        JobProgressImpl.increaseWorkerExecuteCount();
        JobProgressImpl.setRestartedWorkers(restartedWorkers.values());

        try {
            managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
        } catch (JobFaultyException faultDuringExecution) {
            // a worker in the cluster should have failed
            // mark the job faulty, this worker will be re-executed in the next iteration
            JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
            LOG.warning("thrown JobFaultyException. Some workers should have failed.");
        }

        // job is not healthy: skip the final barrier and try again
        if (!JobProgress.isJobHealthy()) {
            continue;
        }

        // this worker is done, but we also need to make sure that all other workers finished
        // wait on the barrier indefinitely until all workers arrive
        // or the barrier is broken with a job fault
        LOG.info("Worker completed, waiting for other workers to finish at the final barrier.");
        try {
            workerController.waitOnBarrier(Long.MAX_VALUE);
            LOG.info("Worker finished successfully");
            return true;
        } catch (TimeoutException timeout) {
            // this should never happen
            throw new Twister2RuntimeException("Could not pass through the final barrier", timeout);
        } catch (JobFaultyException faultAtBarrier) {
            JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
            LOG.warning("thrown JobFaultyException. Some workers failed before finishing.");
        }
    }

    LOG.info(String.format("Re-executed IWorker %d times and failed, we are exiting", maxRetries));
    return false;
}
Also used : Twister2RuntimeException(edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException) JobFaultyException(edu.iu.dsc.tws.api.exceptions.JobFaultyException) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)

Aggregations

JobFaultyException (edu.iu.dsc.tws.api.exceptions.JobFaultyException)5 TimeoutException (edu.iu.dsc.tws.api.exceptions.TimeoutException)5 Twister2Exception (edu.iu.dsc.tws.api.exceptions.Twister2Exception)2 Twister2RuntimeException (edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException)2 Message (com.google.protobuf.Message)1 BlockingSendException (edu.iu.dsc.tws.api.exceptions.net.BlockingSendException)1 RequestID (edu.iu.dsc.tws.api.net.request.RequestID)1 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)1