Use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.
From the class ProvisioningTask, method executeOnce.
/**
 * Runs a single step of the provisioning state machine.
 *
 * <p>The task info is persisted before anything else so that this task can be rebuilt from what is
 * stored in the ProvisionerStore. The subtask registered for the persisted state is then executed;
 * failures that are instances of {@link RetryableProvisionException} are handed to the retry strategy.
 *
 * @return {@code -1} when there is nothing further to run (the task was cancelled, the end subtask was
 *     reached, the subtask produced no follow-up task info, or the subtask failed); {@code 0} when the
 *     state changed and the next subtask should run right away; otherwise the non-negative delay
 *     reported by the polling strategy
 * @throws Exception if the initial persist fails, if no subtask is registered for the current state
 *     ({@link IllegalStateException}), if the execution is interrupted, or if recording the failure
 *     itself fails
 */
@Override
public final long executeOnce() throws Exception {
  RetryStrategy retries = getRetryStrategy();
  Map<ProvisioningOp.Status, ProvisioningSubtask> subtasksByState = getSubTasks();
  ProvisioningTaskInfo persisted = persistTaskInfo(taskInfo, retries);
  ProvisioningOp.Status currentState = persisted.getProvisioningOp().getStatus();

  if (currentState == ProvisioningOp.Status.CANCELLED) {
    LOG.debug("Cancelled {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Look up the work registered for the state we are in
  ProvisioningSubtask current = subtasksByState.get(currentState);
  if (current == null) {
    // not expected: every reachable state should have a subtask
    throw new IllegalStateException(String.format(
      "Invalid state '%s' in provisioning task for program run '%s'. "
        + "This means there is a bug in provisioning state machine. "
        + "Please reach out to the development team.",
      currentState, programRunId));
  }
  if (current == EndSubtask.INSTANCE) {
    LOG.debug("Completed {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Remember when this subtask was first attempted; the value is reset once the state moves on
  if (subTaskStartTime == 0L) {
    subTaskStartTime = System.currentTimeMillis();
  }

  try {
    PROGRESS_LOG.debug("Executing {} subtask {} for program run {}.",
                       persisted.getProvisioningOp().getType(), currentState, programRunId);
    taskInfo = Retries.callWithInterruptibleRetries(() -> current.execute(persisted), retries,
                                                    failure -> failure instanceof RetryableProvisionException)
      .orElse(null);
    PROGRESS_LOG.debug("Completed {} subtask {} for program run {}.",
                       persisted.getProvisioningOp().getType(), currentState, programRunId);

    // The subtask produced no follow-up, so the task is finished
    if (taskInfo == null) {
      LOG.debug("No more {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
      return -1L;
    }

    // State moved forward: clear the polling bookkeeping and run the next subtask immediately
    if (taskInfo.getProvisioningOp().getStatus() != currentState) {
      subTaskPollingStrategy = null;
      subTaskStartTime = 0L;
      subTaskExecNums = 0;
      return 0;
    }

    // Same state as before: ask the polling strategy how long to wait before trying again
    if (subTaskPollingStrategy == null) {
      subTaskPollingStrategy = provisioner.getPollingStrategy(provisionerContext, taskInfo.getCluster());
    }
    return Math.max(0, subTaskPollingStrategy.nextPoll(subTaskExecNums++, subTaskStartTime));
  } catch (InterruptedException e) {
    // Interruption is propagated untouched rather than being recorded as a task failure
    throw e;
  } catch (Exception e) {
    LOG.error("{} task failed in {} state for program run {} due to {}.",
              persisted.getProvisioningOp().getType(), currentState, programRunId, e.getMessage(), e);
    handleSubtaskFailure(persisted, e);
    // Persist the same task info with its status switched to FAILED
    ProvisioningOp failedOp = new ProvisioningOp(persisted.getProvisioningOp().getType(),
                                                 ProvisioningOp.Status.FAILED);
    persistTaskInfo(new ProvisioningTaskInfo(persisted, failedOp, persisted.getCluster()), retries);
    LOG.debug("Terminated {} task for program run {} due to exception.",
              initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }
}
Use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.
From the class ProgramLifecycleService, method issueStop.
/**
 * Requests a stop of one run, or of all runs, of the given {@link ProgramId} and hands back one
 * {@link ListenableFuture} per run that was stopped, each carrying that run's {@link ProgramRunId}.
 * Callers may block on the returned futures to wait for the stops to finish.
 *
 * @param programId the {@link ProgramId program} whose run(s) should be stopped
 * @param runId the id of the single run to stop. When null, every run of the program reported by the
 *              {@link ProgramRuntimeService} is stopped.
 * @return the list of {@link ListenableFuture}s, each yielding the {@link ProgramRunId} of a stopped run
 * @throws NotFoundException if the app, program or run was not found
 * @throws BadRequestException if the program is not running, or if the run was started by a workflow
 * @throws UnauthorizedException if the caller is not authorized to stop the program. Stopping a program
 *                               requires the {@link ApplicationPermission#EXECUTE} permission on it.
 */
public List<ListenableFuture<ProgramRunId>> issueStop(ProgramId programId, @Nullable String runId) throws Exception {
  accessEnforcer.enforce(programId, authenticationContext.getPrincipal(), ApplicationPermission.EXECUTE);

  // Gather what the runtime service and the run records each know about the program
  Map<RunId, RuntimeInfo> runtimeInfos = findRuntimeInfo(programId, runId);
  Map<ProgramRunId, RunRecordDetail> activeRunRecords = getActiveRuns(programId, runId);
  if (runtimeInfos.isEmpty() && activeRunRecords.isEmpty()) {
    // Neither source knows of a run: fail, after first verifying that the program itself exists
    Store.ensureProgramExists(programId, store.getApplication(programId.getParent()));
    throw new BadRequestException(String.format("Program '%s' is not running.", programId));
  }

  // The runs to stop are the union of both sources. A run record can exist slightly before the run is
  // registered with the runtime service, so the loop below keeps retrying for at most 3 seconds.
  Stream<String> runsKnownToRuntime = runtimeInfos.keySet().stream().map(RunId::getId);
  Stream<String> runsWithActiveRecord = activeRunRecords.keySet().stream().map(ProgramRunId::getRun);
  Set<String> pendingStops = Stream.concat(runsKnownToRuntime, runsWithActiveRecord).collect(Collectors.toSet());

  List<ListenableFuture<ProgramRunId>> stopFutures = new ArrayList<>();
  Stopwatch timer = new Stopwatch().start();
  Set<ProgramRunId> cancelledProvisionRuns = new HashSet<>();

  while (!pendingStops.isEmpty() && timer.elapsedTime(TimeUnit.SECONDS) < 3L) {
    for (Iterator<String> pending = pendingStops.iterator(); pending.hasNext(); ) {
      ProgramRunId candidate = programId.run(pending.next());

      RunRecordDetail runDetail = activeRunRecords.get(candidate);
      if (runDetail == null) {
        runDetail = store.getRun(candidate);
      }
      // A run launched by a workflow that is still running has to be stopped through the workflow
      if (runDetail != null
        && runDetail.getProperties().containsKey("workflowrunid")
        && runDetail.getStatus().equals(ProgramRunStatus.RUNNING)) {
        throw new BadRequestException(String.format(
          "Cannot stop the program '%s' started by the Workflow " + "run '%s'. Please stop the Workflow.",
          candidate, runDetail.getProperties().get("workflowrunid")));
      }

      RuntimeInfo liveInfo = runtimeService.lookup(programId, RunIds.fromString(candidate.getRun()));
      if (liveInfo == null) {
        // Without a runtime info the run may still be provisioning, so try cancelling that task.
        Optional<ProvisioningTaskInfo> cancelled = provisioningService.cancelProvisionTask(candidate);
        if (cancelled.isPresent()) {
          cancelledProvisionRuns.add(candidate);
          // Race: the cancel may land too late to stop the PROVISIONED notification from going out.
          // If it was sent but not yet consumed, the run will still start and show up in the runtime
          // service, so it must stay pending and be looked up again on the next pass. That
          // notification is only sent after the task reaches CREATED, hence for every other state
          // the stop is complete and the run can be dropped from the pending set.
          if (cancelled.get().getProvisioningOp().getStatus() != ProvisioningOp.Status.CREATED) {
            pending.remove();
          }
        }
        continue;
      }

      // A runtime info exists, so the run is in the 'starting' state or later: stop via its controller
      ListenableFuture<ProgramController> stopping = liveInfo.getController().stop();
      stopFutures.add(Futures.transform(stopping, ProgramController::getProgramRunId));
      pending.remove();
      // If this run was cancelled during provisioning on an earlier pass, it already got its future
      // just above; forget the cancellation so that no duplicate future is added after the loop.
      cancelledProvisionRuns.remove(candidate);
    }

    if (pendingStops.isEmpty()) {
      break;
    }

    // Leftovers had neither a runtime info nor a provisioning task: either the run has finished, or it
    // moved from provisioning to starting while we were stopping. Re-read the active runs, keep only
    // those still pending, and retry them.
    Set<String> unresolved = pendingStops;
    activeRunRecords = getActiveRuns(programId, runId).entrySet().stream()
      .filter(entry -> unresolved.contains(entry.getKey().getRun()))
      .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    pendingStops = activeRunRecords.keySet().stream().map(ProgramRunId::getRun).collect(Collectors.toSet());
    if (!pendingStops.isEmpty()) {
      TimeUnit.MILLISECONDS.sleep(200);
    }
  }

  // Runs that were stopped purely by cancelling their provisioning task get an already-completed future
  for (ProgramRunId cancelledRun : cancelledProvisionRuns) {
    SettableFuture<ProgramRunId> done = SettableFuture.create();
    done.set(cancelledRun);
    stopFutures.add(done);
  }
  return stopFutures;
}
Use of io.cdap.cdap.internal.provision.ProvisioningTaskInfo in project cdap by caskdata.
From the class ProvisioningSubtask, method execute.
/**
 * Runs this subtask against the cluster held in the given task info and, when the transition yields a
 * following state, builds the task info to hand to the next subtask.
 *
 * @param taskInfo information about the task being executed, including the current cluster state
 * @return the task info for the next subtask, carrying the new state and the cluster returned by this
 *     subtask; empty when the transition yields no further state
 * @throws Exception if there was an error executing the subtask
 */
public Optional<ProvisioningTaskInfo> execute(ProvisioningTaskInfo taskInfo) throws Exception {
  Cluster updatedCluster = execute(taskInfo.getCluster());
  return transition.apply(updatedCluster)
    // keep the operation type, swap in the state chosen by the transition
    .map(followingState -> new ProvisioningOp(taskInfo.getProvisioningOp().getType(), followingState))
    .map(followingOp -> new ProvisioningTaskInfo(taskInfo, followingOp, updatedCluster));
}
Aggregations