Search in sources :

Example 1 with ProvisioningTaskInfo

use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.

the class ProvisioningTask method executeOnce.

/**
 * Runs a single step of this provisioning task. The task info is persisted first, so that the task
 * can be re-created from what is stored in the ProvisionerStore, and then the subtask registered
 * for the persisted state is executed.
 *
 * @return {@code -1} when the task was cancelled, reached the end subtask, produced no further task
 *         info, or failed; {@code 0} when the state changed and the next subtask should run
 *         immediately; otherwise the non-negative value supplied by the polling strategy
 * @throws IllegalStateException if no subtask is registered for the current state
 * @throws InterruptedException if the subtask execution was interrupted
 */
@Override
public final long executeOnce() throws Exception {
    RetryStrategy retries = getRetryStrategy();
    Map<ProvisioningOp.Status, ProvisioningSubtask> subtasksByState = getSubTasks();
    ProvisioningTaskInfo persistedInfo = persistTaskInfo(taskInfo, retries);
    ProvisioningOp.Status currentState = persistedInfo.getProvisioningOp().getStatus();

    if (ProvisioningOp.Status.CANCELLED == currentState) {
        LOG.debug("Cancelled {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }

    // Look up the unit of work registered for the state we are in
    ProvisioningSubtask current = subtasksByState.get(currentState);
    if (current == null) {
        // A state without a subtask indicates a hole in the state machine
        String message = String.format(
            "Invalid state '%s' in provisioning task for program run '%s'. "
                + "This means there is a bug in provisioning state machine. "
                + "Please reach out to the development team.",
            currentState, programRunId);
        throw new IllegalStateException(message);
    }
    if (current == EndSubtask.INSTANCE) {
        LOG.debug("Completed {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }

    // Remember when this subtask was first attempted; the polling strategy is given this timestamp
    if (subTaskStartTime == 0L) {
        subTaskStartTime = System.currentTimeMillis();
    }

    try {
        PROGRESS_LOG.debug("Executing {} subtask {} for program run {}.", persistedInfo.getProvisioningOp().getType(), currentState, programRunId);
        taskInfo = Retries.callWithInterruptibleRetries(
            () -> current.execute(persistedInfo),
            retries,
            t -> t instanceof RetryableProvisionException).orElse(null);
        PROGRESS_LOG.debug("Completed {} subtask {} for program run {}.", persistedInfo.getProvisioningOp().getType(), currentState, programRunId);

        // The subtask produced no follow-up task info, so there is nothing left to run
        if (taskInfo == null) {
            LOG.debug("No more {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
            return -1L;
        }

        // A state change means the next subtask runs right away; clear the per-subtask polling bookkeeping
        if (taskInfo.getProvisioningOp().getStatus() != currentState) {
            subTaskPollingStrategy = null;
            subTaskStartTime = 0L;
            subTaskExecNums = 0;
            return 0L;
        }

        // Same state as before: ask the (lazily obtained) polling strategy how long to wait
        if (subTaskPollingStrategy == null) {
            subTaskPollingStrategy = provisioner.getPollingStrategy(provisionerContext, taskInfo.getCluster());
        }
        long delay = subTaskPollingStrategy.nextPoll(subTaskExecNums++, subTaskStartTime);
        return Math.max(0L, delay);
    } catch (InterruptedException e) {
        // Interruption is propagated untouched instead of being recorded as a failure
        throw e;
    } catch (Exception e) {
        LOG.error("{} task failed in {} state for program run {} due to {}.", persistedInfo.getProvisioningOp().getType(), currentState, programRunId, e.getMessage(), e);
        handleSubtaskFailure(persistedInfo, e);
        // Persist a FAILED operation of the same type, keeping the cluster from the persisted info
        ProvisioningTaskInfo failedInfo = new ProvisioningTaskInfo(
            persistedInfo,
            new ProvisioningOp(persistedInfo.getProvisioningOp().getType(), ProvisioningOp.Status.FAILED),
            persistedInfo.getCluster());
        persistTaskInfo(failedInfo, retries);
        LOG.debug("Terminated {} task for program run {} due to exception.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }
}
Also used : RetryableProvisionException(io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException) ProvisioningTaskInfo(io.cdap.cdap.internal.provision.ProvisioningTaskInfo) ProvisioningOp(io.cdap.cdap.internal.provision.ProvisioningOp) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy)

Example 2 with ProvisioningTaskInfo

use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.

the class ProgramLifecycleService method issueStop.

/**
 * Issues a command to stop the specified {@link RunId} of the specified {@link ProgramId} and returns a list of
 * {@link ListenableFuture}s, each carrying the {@link ProgramRunId} of a run for which a stop was issued.
 * Clients can wait for completion of each {@link ListenableFuture}.
 *
 * <p>Runs that are known to the runtime service are stopped through their controller. Runs without runtime
 * info have their provisioning task cancelled instead; those are reported through already-completed futures.
 * The method retries for up to 3 seconds to cover runs that move between those two situations while the
 * stop is in progress.
 *
 * @param programId the {@link ProgramId program} to issue a stop for
 * @param runId the runId of the program run to stop. If null, all runs of the program as returned by
 *              {@link ProgramRuntimeService} are stopped.
 * @return a list of {@link ListenableFuture} with the {@link ProgramRunId} that clients can wait on for stop
 *         to complete.
 * @throws NotFoundException if the app, program or run was not found
 * @throws BadRequestException if an attempt is made to stop a program that is either not running or
 *                             was started by a workflow
 * @throws UnauthorizedException if the user issuing the command is not authorized to stop the program. To stop a
 *                               program, a user requires {@link ApplicationPermission#EXECUTE} permission on
 *                               the program.
 */
public List<ListenableFuture<ProgramRunId>> issueStop(ProgramId programId, @Nullable String runId) throws Exception {
    // Enforce that the calling principal holds EXECUTE permission on the program before doing anything else
    accessEnforcer.enforce(programId, authenticationContext.getPrincipal(), ApplicationPermission.EXECUTE);
    // See if the program is running as per the runtime service
    Map<RunId, RuntimeInfo> runtimeInfos = findRuntimeInfo(programId, runId);
    Map<ProgramRunId, RunRecordDetail> activeRunRecords = getActiveRuns(programId, runId);
    if (runtimeInfos.isEmpty() && activeRunRecords.isEmpty()) {
        // Error out if no run information from runtime service and from run record
        // NOTE(review): ensureProgramExists presumably raises the documented NotFoundException for a missing
        // program, so that only an existing program is reported as "not running" -- confirm
        Store.ensureProgramExists(programId, store.getApplication(programId.getParent()));
        throw new BadRequestException(String.format("Program '%s' is not running.", programId));
    }
    // Stop the running program based on a combination of runtime info and run record
    // It's possible that some of them are not yet available from the runtimeService due to timing
    // differences between the run record was created vs being added to runtimeService
    // So we retry in a loop for up to 3 seconds max to cater for those cases
    // pendingStops holds the run id strings (union of both sources) that still need a stop to be issued
    Set<String> pendingStops = Stream.concat(runtimeInfos.keySet().stream().map(RunId::getId), activeRunRecords.keySet().stream().map(ProgramRunId::getRun)).collect(Collectors.toSet());
    List<ListenableFuture<ProgramRunId>> futures = new ArrayList<>();
    Stopwatch stopwatch = new Stopwatch().start();
    // Runs whose provisioning task was cancelled and that have not (yet) been stopped through a controller
    Set<ProgramRunId> cancelledProvisionRuns = new HashSet<>();
    while (!pendingStops.isEmpty() && stopwatch.elapsedTime(TimeUnit.SECONDS) < 3L) {
        // An explicit iterator is used because entries are removed from pendingStops while iterating
        Iterator<String> iterator = pendingStops.iterator();
        while (iterator.hasNext()) {
            ProgramRunId activeRunId = programId.run(iterator.next());
            RunRecordDetail runRecord = activeRunRecords.get(activeRunId);
            if (runRecord == null) {
                // Not among the active run records fetched earlier; fall back to looking the run up in the store
                runRecord = store.getRun(activeRunId);
            }
            // Check if the program is actually started from workflow and the workflow is running
            if (runRecord != null && runRecord.getProperties().containsKey("workflowrunid") && runRecord.getStatus().equals(ProgramRunStatus.RUNNING)) {
                String workflowRunId = runRecord.getProperties().get("workflowrunid");
                throw new BadRequestException(String.format("Cannot stop the program '%s' started by the Workflow " + "run '%s'. Please stop the Workflow.", activeRunId, workflowRunId));
            }
            RuntimeInfo runtimeInfo = runtimeService.lookup(programId, RunIds.fromString(activeRunId.getRun()));
            // if there is a runtimeInfo, the run is in the 'starting' state or later
            if (runtimeInfo != null) {
                ListenableFuture<ProgramController> future = runtimeInfo.getController().stop();
                futures.add(Futures.transform(future, ProgramController::getProgramRunId));
                iterator.remove();
                // if it was in this set, it means we cancelled a task, but it had already sent a PROVISIONED message
                // by the time we cancelled it. We then waited for it to show up in the runtime service and got here.
                // We added a future for this run in the lines above, but we don't want to add another duplicate future
                // at the end of this loop, so remove this run from the cancelled provision runs.
                cancelledProvisionRuns.remove(activeRunId);
            } else {
                // if there is no runtimeInfo, the run could be in the provisioning state.
                Optional<ProvisioningTaskInfo> cancelledInfo = provisioningService.cancelProvisionTask(activeRunId);
                // An empty result leaves the run in pendingStops; it is re-evaluated after the refresh below
                cancelledInfo.ifPresent(taskInfo -> {
                    cancelledProvisionRuns.add(activeRunId);
                    // This state check is to handle a race condition where we cancel the provision task, but not in time
                    // to prevent it from sending the PROVISIONED notification.
                    // If the notification was sent, but not yet consumed, we are *not* done stopping the run.
                    // We have to wait for the notification to be consumed, which will start the run, and place the controller
                    // in the runtimeService. The next time we loop, we can find it in the runtimeService and tell it to stop.
                    // If the notification was not sent, then we *are* done stopping the run.
                    // Therefore, if the state is CREATED, we don't remove it from the iterator so that the run will get
                    // checked again in the next loop, when we may get the controller from the runtimeService to stop it.
                    // No other task states have this race condition, as the PROVISIONED notification is only sent
                    // after the state transitions to CREATED. Therefore it is safe to remove the runId from the iterator,
                    // as we know we are done stopping it.
                    ProvisioningOp.Status taskState = taskInfo.getProvisioningOp().getStatus();
                    if (taskState != ProvisioningOp.Status.CREATED) {
                        iterator.remove();
                    }
                });
            }
        }
        if (!pendingStops.isEmpty()) {
            // If not able to stop all of them, it means there were some runs that didn't have a runtime info and
            // didn't have a provisioning task. This can happen if the run was already finished, or the run transitioned
            // from the provisioning state to the starting state during this stop operation.
            // We'll get the active runs again and filter it by the pending stops. Stop will be retried for those.
            // The copy below exists only because the lambda needs an effectively final reference
            Set<String> finalPendingStops = pendingStops;
            activeRunRecords = getActiveRuns(programId, runId).entrySet().stream().filter(e -> finalPendingStops.contains(e.getKey().getRun())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            // Pending runs that are no longer active are dropped here, so they are not retried
            pendingStops = activeRunRecords.keySet().stream().map(ProgramRunId::getRun).collect(Collectors.toSet());
            if (!pendingStops.isEmpty()) {
                // Pause 200 ms before the next attempt
                TimeUnit.MILLISECONDS.sleep(200);
            }
        }
    }
    // Report every run that was stopped by cancelling its provisioning task through an already-completed future
    for (ProgramRunId cancelledProvisionRun : cancelledProvisionRuns) {
        SettableFuture<ProgramRunId> future = SettableFuture.create();
        future.set(cancelledProvisionRun);
        futures.add(future);
    }
    return futures;
}
Also used : ProgramController(io.cdap.cdap.app.runtime.ProgramController) RuntimeInfo(io.cdap.cdap.app.runtime.ProgramRuntimeService.RuntimeInfo) RunRecordDetail(io.cdap.cdap.internal.app.store.RunRecordDetail) ArrayList(java.util.ArrayList) Stopwatch(com.google.common.base.Stopwatch) ProvisioningTaskInfo(io.cdap.cdap.internal.provision.ProvisioningTaskInfo) LogEntry(org.apache.twill.api.logging.LogEntry) BadRequestException(io.cdap.cdap.common.BadRequestException) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProvisioningOp(io.cdap.cdap.internal.provision.ProvisioningOp) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 3 with ProvisioningTaskInfo

use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.

the class ProvisioningSubtask method execute.

/**
 * Runs this subtask against the cluster carried by the given task info and, if the transition yields a
 * next state, builds the task info for that state.
 *
 * @param taskInfo information about the task being executed, including the current cluster state
 * @return the task info for the next state, built from the given task info and the cluster returned by
 *         the subtask; empty if the transition yields no next state
 * @throws Exception if there was an error executing the subtask
 */
public Optional<ProvisioningTaskInfo> execute(ProvisioningTaskInfo taskInfo) throws Exception {
    Cluster updatedCluster = execute(taskInfo.getCluster());
    // The next operation keeps the current operation type and only advances the status
    return transition.apply(updatedCluster)
        .map(status -> new ProvisioningTaskInfo(
            taskInfo,
            new ProvisioningOp(taskInfo.getProvisioningOp().getType(), status),
            updatedCluster));
}
Also used : ProvisioningTaskInfo(io.cdap.cdap.internal.provision.ProvisioningTaskInfo) Cluster(io.cdap.cdap.runtime.spi.provisioner.Cluster) ProvisioningOp(io.cdap.cdap.internal.provision.ProvisioningOp)

Aggregations

ProvisioningOp (io.cdap.cdap.internal.provision.ProvisioningOp)3 ProvisioningTaskInfo (io.cdap.cdap.internal.provision.ProvisioningTaskInfo)3 Stopwatch (com.google.common.base.Stopwatch)1 ListenableFuture (com.google.common.util.concurrent.ListenableFuture)1 ProgramController (io.cdap.cdap.app.runtime.ProgramController)1 RuntimeInfo (io.cdap.cdap.app.runtime.ProgramRuntimeService.RuntimeInfo)1 BadRequestException (io.cdap.cdap.common.BadRequestException)1 RetryStrategy (io.cdap.cdap.common.service.RetryStrategy)1 RunRecordDetail (io.cdap.cdap.internal.app.store.RunRecordDetail)1 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)1 Cluster (io.cdap.cdap.runtime.spi.provisioner.Cluster)1 RetryableProvisionException (io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 RunId (org.apache.twill.api.RunId)1 LogEntry (org.apache.twill.api.logging.LogEntry)1