Use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.
The class RemoteExecutionTwillController, method complete.
/**
 * Completes this controller: stops the local execution service and makes sure the remote
 * process is no longer running.
 *
 * <p>The remote process is polled for up to 5 seconds. If it is still reported as running after
 * that, if the status check ultimately throws, or if the calling thread is interrupted while
 * waiting, a kill is issued through the remote process controller. The kill is best-effort: a
 * failure is logged and not propagated.
 *
 * <p>If the wait was interrupted, the thread's interrupt status is restored before this method
 * returns so that callers can still observe the interruption.
 */
public void complete() {
  terminateOnServiceStop = true;
  executionService.stop();

  // Tracks whether the wait was interrupted, so the flag can be restored once the kill is done.
  boolean interrupted = false;
  try {
    RetryStrategy retryStrategy = RetryStrategies.timeLimit(5, TimeUnit.SECONDS, RetryStrategies.exponentialDelay(500, 2000, TimeUnit.MILLISECONDS));
    // Make sure the remote execution is completed
    // Give 5 seconds for the remote process to shutdown. After 5 seconds, issues a kill.
    long startTime = System.currentTimeMillis();
    while (Retries.callWithRetries(remoteProcessController::isRunning, retryStrategy, Exception.class::isInstance)) {
      if (System.currentTimeMillis() - startTime >= 5000) {
        throw new IllegalStateException("Remote process for " + programRunId + " is still running");
      }
      TimeUnit.SECONDS.sleep(1);
    }
  } catch (Exception e) {
    // The sleep above clears the interrupt flag when it throws; remember it here and restore it
    // only after the kill attempt, so the kill itself is not affected by a pending interrupt.
    interrupted = e instanceof InterruptedException;
    // If there is exception, use the remote execution controller to try killing the remote process
    try {
      // Pass the cause along so the reason for the forced termination is not lost.
      LOG.debug("Force termination of remote process for program run {}", programRunId, e);
      remoteProcessController.kill();
    } catch (Exception ex) {
      LOG.warn("Failed to terminate remote process for program run {}", programRunId, ex);
    }
  } finally {
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
}
Use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.
The class ProvisioningTask, method executeOnce.
/**
 * Runs a single iteration of this provisioning task. The task info is persisted before the
 * subtask runs, such that this task can be re-created from the task info stored in the
 * ProvisionerStore.
 *
 * @return {@code -1} when there is nothing further to execute (the task was cancelled, reached
 *     the end subtask, produced no next task info, or failed); {@code 0} when the status changed
 *     and the next subtask should run immediately; otherwise the non-negative value produced by
 *     the polling strategy
 * @throws Exception if persisting the task info fails, if the persisted status has no matching
 *     subtask, or if the execution is interrupted
 */
@Override
public final long executeOnce() throws Exception {
  RetryStrategy retries = getRetryStrategy();
  Map<ProvisioningOp.Status, ProvisioningSubtask> subtasksByStatus = getSubTasks();

  ProvisioningTaskInfo persistedInfo = persistTaskInfo(taskInfo, retries);
  ProvisioningOp.Status currentStatus = persistedInfo.getProvisioningOp().getStatus();
  if (currentStatus == ProvisioningOp.Status.CANCELLED) {
    LOG.debug("Cancelled {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Look up the subtask registered for the persisted status
  ProvisioningSubtask currentSubtask = subtasksByStatus.get(currentStatus);
  if (currentSubtask == null) {
    // A status without a subtask indicates a broken state machine
    throw new IllegalStateException(String.format("Invalid state '%s' in provisioning task for program run '%s'. " + "This means there is a bug in provisioning state machine. " + "Please reach out to the development team.", currentStatus, programRunId));
  }
  if (currentSubtask == EndSubtask.INSTANCE) {
    LOG.debug("Completed {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }

  // Record when this subtask was first attempted; it is reset once the status moves on
  if (subTaskStartTime == 0L) {
    subTaskStartTime = System.currentTimeMillis();
  }

  try {
    PROGRESS_LOG.debug("Executing {} subtask {} for program run {}.", persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId);
    taskInfo = Retries.callWithInterruptibleRetries(() -> currentSubtask.execute(persistedInfo), retries, t -> t instanceof RetryableProvisionException).orElse(null);
    PROGRESS_LOG.debug("Completed {} subtask {} for program run {}.", persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId);

    // The subtask produced no follow-up task info, so the task is done
    if (taskInfo == null) {
      LOG.debug("No more {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
      return -1L;
    }

    ProvisioningOp.Status nextStatus = taskInfo.getProvisioningOp().getStatus();
    if (currentStatus != nextStatus) {
      // Status advanced: clear the per-subtask polling bookkeeping and run the next one right away
      subTaskPollingStrategy = null;
      subTaskStartTime = 0L;
      subTaskExecNums = 0;
      return 0;
    }

    // Status unchanged: let the (lazily created) polling strategy decide the delay
    if (subTaskPollingStrategy == null) {
      subTaskPollingStrategy = provisioner.getPollingStrategy(provisionerContext, taskInfo.getCluster());
    }
    return Math.max(0, subTaskPollingStrategy.nextPoll(subTaskExecNums++, subTaskStartTime));
  } catch (InterruptedException interruption) {
    // Interruption is propagated as-is and is not treated as a subtask failure
    throw interruption;
  } catch (Exception failure) {
    LOG.error("{} task failed in {} state for program run {} due to {}.", persistedInfo.getProvisioningOp().getType(), currentStatus, programRunId, failure.getMessage(), failure);
    handleSubtaskFailure(persistedInfo, failure);

    // Persist a FAILED status for the same operation type and cluster
    ProvisioningOp failedOp = new ProvisioningOp(persistedInfo.getProvisioningOp().getType(), ProvisioningOp.Status.FAILED);
    ProvisioningTaskInfo failedInfo = new ProvisioningTaskInfo(persistedInfo, failedOp, persistedInfo.getCluster());
    persistTaskInfo(failedInfo, retries);
    LOG.debug("Terminated {} task for program run {} due to exception.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
    return -1L;
  }
}
Use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.
The class ProgramNotificationSubscriberService, method doStartUp.
/**
 * Starts up the service and then scans the active runs in the store, with retries.
 *
 * <p>Runs in PENDING or STARTING status are registered with the run record monitor service. For
 * STARTING runs a start is additionally published again through the program state writer. If
 * handling a run throws, the run is reported as errored through the program state writer.
 *
 * @throws Exception if the parent start-up or the scan fails
 */
@Override
protected void doStartUp() throws Exception {
  super.doStartUp();

  int batchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
  RetryStrategy retryStrategy = RetryStrategies.fromConfiguration(cConf, "system.runtime.monitor.");
  long startTs = System.currentTimeMillis();

  Retries.runWithRetries(() -> store.scanActiveRuns(batchSize, (activeRun) -> {
    // Skip runs whose start timestamp is greater than the time captured above.
    // NOTE(review): confirm getStartTs() uses the same unit as System.currentTimeMillis().
    if (activeRun.getStartTs() > startTs) {
      return;
    }
    try {
      ProgramRunStatus status = activeRun.getStatus();
      boolean starting = status == ProgramRunStatus.STARTING;
      if (status != ProgramRunStatus.PENDING && !starting) {
        // Only PENDING and STARTING runs are handled here
        return;
      }
      runRecordMonitorService.addRequest(activeRun.getProgramRunId());
      if (starting) {
        // It is unknown what is the state of program runs in STARTING state.
        // A STARTING message is published again to retry STARTING logic.
        ProgramOptions programOptions = new SimpleProgramOptions(activeRun.getProgramRunId().getParent(), new BasicArguments(activeRun.getSystemArgs()), new BasicArguments(activeRun.getUserArgs()));
        LOG.debug("Retrying to start run {}.", activeRun.getProgramRunId());
        programStateWriter.start(activeRun.getProgramRunId(), programOptions, null, this.store.loadProgram(activeRun.getProgramRunId().getParent()));
      }
    } catch (Exception e) {
      ProgramRunId programRunId = activeRun.getProgramRunId();
      LOG.warn("Retrying to start run {} failed. Marking it as failed.", programRunId, e);
      programStateWriter.error(programRunId, e);
    }
  }), retryStrategy, e -> true);
}
Use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.
The class ServiceProgramRunner, method run.
/**
 * Validates the program options, then creates and starts the HTTP server for a service program.
 *
 * @param program the program to run; must be of type {@code ProgramType.SERVICE} and carry an
 *     application specification
 * @param options the program options; must provide a non-negative instance id, a positive
 *     instance count and a host name
 * @return a controller wrapping the started service HTTP server
 */
@Override
public ProgramController run(Program program, ProgramOptions options) {
  // Instance id and instance count fall back to values that fail validation when absent
  String instanceIdArg = options.getArguments().getOption(ProgramOptionConstants.INSTANCE_ID, "-1");
  int instanceId = Integer.parseInt(instanceIdArg);
  Preconditions.checkArgument(instanceId >= 0, "Missing instance Id");

  String instanceCountArg = options.getArguments().getOption(ProgramOptionConstants.INSTANCES, "0");
  int instanceCount = Integer.parseInt(instanceCountArg);
  Preconditions.checkArgument(instanceCount > 0, "Invalid or missing instance count");

  RunId runId = ProgramRunners.getRunId(options);

  ApplicationSpecification appSpec = program.getApplicationSpecification();
  Preconditions.checkNotNull(appSpec, "Missing application specification.");

  ProgramType programType = program.getType();
  Preconditions.checkNotNull(programType, "Missing processor type.");
  Preconditions.checkArgument(programType == ProgramType.SERVICE, "Only Service process type is supported.");

  ServiceSpecification serviceSpec = appSpec.getServices().get(program.getName());

  String host = options.getArguments().getOption(ProgramOptionConstants.HOST);
  Preconditions.checkArgument(host != null, "No hostname is provided");

  // Hand the program run context to the dataset framework when it can accept one
  if (datasetFramework instanceof ProgramContextAware) {
    ((ProgramContextAware) datasetFramework).setContext(new BasicProgramContext(program.getId().run(runId)));
  }

  final PluginInstantiator pluginInstantiator = createPluginInstantiator(options, program.getClassLoader());
  try {
    RetryStrategy retries = SystemArguments.getRetryStrategy(options.getUserArguments().asMap(), programType, cConf);
    ArtifactManager artifactManager = artifactManagerFactory.create(program.getId().getNamespaceId(), retries);
    ServiceHttpServer httpServer = new ServiceHttpServer(host, program, options, cConf, serviceSpec, instanceId, instanceCount, serviceAnnouncer, metricsCollectionService, datasetFramework, txClient, discoveryServiceClient, pluginInstantiator, secureStore, secureStoreManager, messagingService, artifactManager, metadataReader, metadataPublisher, namespaceQueryAdmin, pluginFinder, fieldLineageWriter, transactionRunner, preferencesFetcher, remoteClientFactory, contextAccessEnforcer);

    // The listener closes the plugin instantiator once the http server is finished
    httpServer.addListener(createRuntimeServiceListener(Collections.singleton(pluginInstantiator)), Threads.SAME_THREAD_EXECUTOR);

    // The controller is created before the server is started
    ProgramController controller = new ServiceProgramControllerAdapter(httpServer, program.getId().run(runId));
    httpServer.start();
    return controller;
  } catch (Throwable t) {
    // Setup failed: release the plugin instantiator here and rethrow
    Closeables.closeQuietly(pluginInstantiator);
    throw t;
  }
}
Use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.
The class MainOutputCommitter, method setupJob.
/**
 * Prepares this committer for a MapReduce job: resolves the CDAP configuration and a retrying
 * transaction client, starts a long transaction when running on premise (writing it to a file so
 * that it can be re-used in mapreduce tasks), creates the task context and outputs, and finally
 * delegates to the parent implementation.
 *
 * @param jobContext the context of the job being set up
 * @throws IOException if writing the transaction file or the parent set-up fails
 */
@Override
public void setupJob(JobContext jobContext) throws IOException {
  Configuration configuration = jobContext.getConfiguration();
  MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
  MapReduceTaskContextProvider taskContextProvider = classLoader.getTaskContextProvider();
  Injector injector = taskContextProvider.getInjector();
  cConf = injector.getInstance(CConfiguration.class);

  MapReduceContextConfig contextConfig = new MapReduceContextConfig(configuration);
  ProgramId programId = contextConfig.getProgramId();
  LOG.info("Setting up for MapReduce job: namespaceId={}, applicationId={}, program={}, runid={}", programId.getNamespace(), programId.getApplication(), programId.getProgram(), ProgramRunners.getRunId(contextConfig.getProgramOptions()));

  RetryStrategy retryStrategy = SystemArguments.getRetryStrategy(contextConfig.getProgramOptions().getUserArguments().asMap(), programId.getType(), cConf);
  TransactionSystemClient delegateTxClient = injector.getInstance(TransactionSystemClient.class);
  this.txClient = new RetryingLongTransactionSystemClient(delegateTxClient, retryStrategy);

  // A long-running tx is started for the mapreduce job tasks only when running on premise
  boolean onPremise = ProgramRunners.getClusterMode(contextConfig.getProgramOptions()) == ClusterMode.ON_PREMISE;
  if (onPremise) {
    this.transaction = txClient.startLong();

    // Write the tx to a file, so that it can be re-used in mapreduce tasks
    Path txFile = getTxFile(configuration, jobContext.getJobID());
    FileSystem fs = txFile.getFileSystem(configuration);
    try (FSDataOutputStream txOutput = fs.create(txFile, false)) {
      byte[] encodedTx = new TransactionCodec().encode(transaction);
      txOutput.write(encodedTx);
    }
  }

  // The TaskContext is instantiated after the tx is set above. It's used by the operations below
  taskContext = taskContextProvider.get(taskAttemptContext);
  this.outputs = Outputs.transform(contextConfig.getOutputs(), taskContext);
  super.setupJob(jobContext);
}
Aggregations