Search in sources :

Example 1 with RetryStrategy

use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.

the class RemoteExecutionTwillController method complete.

/**
 * Stops the execution service and waits briefly for the remote process to finish.
 *
 * <p>The remote process is polled for up to five seconds. If it is still running after that, or
 * if any step of the wait fails, a kill is issued through the remote process controller. A failure
 * of the kill itself is logged and not propagated.
 *
 * <p>If the calling thread is interrupted while waiting, the kill is still attempted and the
 * thread's interrupt status is restored afterwards so that callers can observe the interruption.
 */
public void complete() {
    terminateOnServiceStop = true;
    executionService.stop();
    try {
        RetryStrategy retryStrategy = RetryStrategies.timeLimit(5, TimeUnit.SECONDS, RetryStrategies.exponentialDelay(500, 2000, TimeUnit.MILLISECONDS));
        // Make sure the remote execution is completed
        // Give 5 seconds for the remote process to shutdown. After 5 seconds, issues a kill.
        long startTime = System.currentTimeMillis();
        while (Retries.callWithRetries(remoteProcessController::isRunning, retryStrategy, Exception.class::isInstance)) {
            if (System.currentTimeMillis() - startTime >= 5000) {
                throw new IllegalStateException("Remote process for " + programRunId + " is still running");
            }
            TimeUnit.SECONDS.sleep(1);
        }
    } catch (Exception e) {
        // Remember an interruption so it is not silently lost by the broad catch.
        boolean interrupted = e instanceof InterruptedException;
        // If there is exception, use the remote execution controller to try killing the remote process
        try {
            LOG.debug("Force termination of remote process for program run {}", programRunId);
            remoteProcessController.kill();
        } catch (Exception ex) {
            interrupted = interrupted || ex instanceof InterruptedException;
            LOG.warn("Failed to terminate remote process for program run {}", programRunId, ex);
        } finally {
            // Restore the flag only after the kill attempt, so the kill is not itself
            // cut short by a pending interrupt.
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }
}
Also used : RetryStrategy(io.cdap.cdap.common.service.RetryStrategy) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with RetryStrategy

use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.

the class ProvisioningTask method executeOnce.

/**
 * Executes one iteration of subtask. It persists task info before each subtask such that this task
 * can be re-created from the task info stored in the ProvisionerStore.
 *
 * <p>The subtask to run is looked up from {@link #getSubTasks()} using the status of the task info
 * that was just persisted. When the subtask leaves the status unchanged, the same subtask is
 * polled again after a delay computed by the provisioner's polling strategy; when the status
 * changes, the polling bookkeeping is reset and the next subtask is run without delay.
 *
 * <p>Any exception other than {@link InterruptedException} raised while running the subtask is
 * logged, passed to {@code handleSubtaskFailure}, and recorded by persisting a task info whose
 * operation status is {@code FAILED}.
 *
 * @return {@code -1} when the task is finished (cancelled, reached the end subtask, the subtask
 *     produced no further task info, or the subtask failed); {@code 0} when the status changed and
 *     the next subtask should run immediately; otherwise the non-negative delay returned by the
 *     polling strategy (time unit is not visible here -- presumably milliseconds, TODO confirm)
 * @throws InterruptedException rethrown unchanged if it is raised while running the subtask
 * @throws IllegalStateException if no subtask is registered for the current status
 * @throws Exception presumably if persisting the task info or handling a failure itself fails
 */
@Override
public final long executeOnce() throws Exception {
    RetryStrategy retryStrategy = getRetryStrategy();
    Map<ProvisioningOp.Status, ProvisioningSubtask> subTasks = getSubTasks();
    // Persist first; the status of the persisted copy decides which subtask runs.
    ProvisioningTaskInfo currentTaskInfo = persistTaskInfo(taskInfo, retryStrategy);
    ProvisioningOp.Status state = currentTaskInfo.getProvisioningOp().getStatus();
    if (state == ProvisioningOp.Status.CANCELLED) {
        LOG.debug("Cancelled {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }
    // Get the sub-task to execute
    ProvisioningSubtask subtask = subTasks.get(state);
    if (subtask == null) {
        // should never happen
        throw new IllegalStateException(String.format("Invalid state '%s' in provisioning task for program run '%s'. " + "This means there is a bug in provisioning state machine. " + "Please reach out to the development team.", state, programRunId));
    }
    // The end subtask is a marker compared by identity; reaching it means the task is done.
    if (subtask == EndSubtask.INSTANCE) {
        LOG.debug("Completed {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }
    // Record the start time only on the first execution of this subtask; it is reset to 0
    // below once the status changes.
    if (subTaskStartTime == 0L) {
        subTaskStartTime = System.currentTimeMillis();
    }
    try {
        PROGRESS_LOG.debug("Executing {} subtask {} for program run {}.", currentTaskInfo.getProvisioningOp().getType(), state, programRunId);
        // Only RetryableProvisionException is retried; an empty result is mapped to null.
        taskInfo = Retries.callWithInterruptibleRetries(() -> subtask.execute(currentTaskInfo), retryStrategy, t -> t instanceof RetryableProvisionException).orElse(null);
        PROGRESS_LOG.debug("Completed {} subtask {} for program run {}.", currentTaskInfo.getProvisioningOp().getType(), state, programRunId);
        // Nothing more to execute
        if (taskInfo == null) {
            LOG.debug("No more {} task for program run {}.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
            return -1L;
        }
        ProvisioningOp.Status nextState = taskInfo.getProvisioningOp().getStatus();
        // If state doesn't change, determine the delay based on the polling strategy
        if (state == nextState) {
            // The polling strategy is created lazily and kept until the status changes.
            if (subTaskPollingStrategy == null) {
                subTaskPollingStrategy = provisioner.getPollingStrategy(provisionerContext, taskInfo.getCluster());
            }
            // Clamp to zero so a negative value from the strategy is never returned.
            return Math.max(0, subTaskPollingStrategy.nextPoll(subTaskExecNums++, subTaskStartTime));
        }
        // Otherwise, execute the next task immediately.
        subTaskPollingStrategy = null;
        subTaskStartTime = 0L;
        subTaskExecNums = 0;
        return 0;
    } catch (InterruptedException e) {
        // Interruption is propagated as-is and is not treated as a subtask failure.
        throw e;
    } catch (Exception e) {
        LOG.error("{} task failed in {} state for program run {} due to {}.", currentTaskInfo.getProvisioningOp().getType(), state, programRunId, e.getMessage(), e);
        handleSubtaskFailure(currentTaskInfo, e);
        // Persist a copy of the current task info with the operation status set to FAILED.
        ProvisioningOp failureOp = new ProvisioningOp(currentTaskInfo.getProvisioningOp().getType(), ProvisioningOp.Status.FAILED);
        ProvisioningTaskInfo failureInfo = new ProvisioningTaskInfo(currentTaskInfo, failureOp, currentTaskInfo.getCluster());
        persistTaskInfo(failureInfo, retryStrategy);
        LOG.debug("Terminated {} task for program run {} due to exception.", initialTaskInfo.getProvisioningOp().getType(), programRunId);
        return -1L;
    }
}
Also used : RetryableProvisionException(io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException) ProvisioningTaskInfo(io.cdap.cdap.internal.provision.ProvisioningTaskInfo) ProvisioningOp(io.cdap.cdap.internal.provision.ProvisioningOp) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy) RetryableProvisionException(io.cdap.cdap.runtime.spi.provisioner.RetryableProvisionException)

Example 3 with RetryStrategy

use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.

the class ProgramNotificationSubscriberService method doStartUp.

/**
 * Runs the parent start-up and then scans the active runs held in the store.
 *
 * <p>Runs whose start timestamp is later than the time captured at the beginning of this method
 * are skipped. Each remaining run in {@code PENDING} or {@code STARTING} status is registered with
 * the run record monitor service; {@code STARTING} runs additionally get a start message published
 * again through the program state writer. If handling a run fails, the failure is logged and the
 * run is reported as errored through the program state writer.
 *
 * <p>The scan itself is retried using the strategy configured under the
 * {@code system.runtime.monitor.} prefix, with a predicate that accepts every failure.
 */
@Override
protected void doStartUp() throws Exception {
    super.doStartUp();
    int scanBatchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
    RetryStrategy scanRetryStrategy = RetryStrategies.fromConfiguration(cConf, "system.runtime.monitor.");
    long startupTime = System.currentTimeMillis();
    Retries.runWithRetries(() -> store.scanActiveRuns(scanBatchSize, (activeRun) -> {
        // Only runs that already existed when start-up began are handled here.
        if (activeRun.getStartTs() > startupTime) {
            return;
        }
        try {
            ProgramRunStatus status = activeRun.getStatus();
            boolean starting = status == ProgramRunStatus.STARTING;
            if (status != ProgramRunStatus.PENDING && !starting) {
                return;
            }
            // Both PENDING and STARTING runs are registered with the monitor service.
            runRecordMonitorService.addRequest(activeRun.getProgramRunId());
            if (starting) {
                // It is unknown what is the state of program runs in STARTING state.
                // A STARTING message is published again to retry STARTING logic.
                ProgramOptions programOptions = new SimpleProgramOptions(
                    activeRun.getProgramRunId().getParent(),
                    new BasicArguments(activeRun.getSystemArgs()),
                    new BasicArguments(activeRun.getUserArgs()));
                LOG.debug("Retrying to start run {}.", activeRun.getProgramRunId());
                programStateWriter.start(
                    activeRun.getProgramRunId(),
                    programOptions,
                    null,
                    this.store.loadProgram(activeRun.getProgramRunId().getParent()));
            }
        } catch (Exception e) {
            ProgramRunId failedRunId = activeRun.getProgramRunId();
            LOG.warn("Retrying to start run {} failed. Marking it as failed.", failedRunId, e);
            programStateWriter.error(failedRunId, e);
        }
    }), scanRetryStrategy, e -> true);
}
Also used : RunRecordDetail(io.cdap.cdap.internal.app.store.RunRecordDetail) ProvisionRequest(io.cdap.cdap.internal.provision.ProvisionRequest) ProvisionerNotifier(io.cdap.cdap.internal.provision.ProvisionerNotifier) TypeToken(com.google.gson.reflect.TypeToken) ImmutablePair(io.cdap.cdap.common.utils.ImmutablePair) NamespaceId(io.cdap.cdap.proto.id.NamespaceId) Notification(io.cdap.cdap.proto.Notification) Inject(com.google.inject.Inject) LoggerFactory(org.slf4j.LoggerFactory) RetryStrategies(io.cdap.cdap.common.service.RetryStrategies) GsonBuilder(com.google.gson.GsonBuilder) WorkflowNode(io.cdap.cdap.api.workflow.WorkflowNode) ProgramStateWriter(io.cdap.cdap.app.runtime.ProgramStateWriter) SimpleProgramOptions(io.cdap.cdap.internal.app.runtime.SimpleProgramOptions) Gson(com.google.gson.Gson) Map(java.util.Map) BasicThrowable(io.cdap.cdap.proto.BasicThrowable) ImmutableMap(com.google.common.collect.ImmutableMap) MessagingService(io.cdap.cdap.messaging.MessagingService) Set(java.util.Set) ProgramRunStatus(io.cdap.cdap.proto.ProgramRunStatus) StructuredTableContext(io.cdap.cdap.spi.data.StructuredTableContext) SchedulableProgramType(io.cdap.cdap.api.schedule.SchedulableProgramType) StandardCharsets(java.nio.charset.StandardCharsets) ApplicationSpecificationAdapter(io.cdap.cdap.internal.app.ApplicationSpecificationAdapter) ScheduleProgramInfo(io.cdap.cdap.api.workflow.ScheduleProgramInfo) MetricsContext(io.cdap.cdap.api.metrics.MetricsContext) List(java.util.List) SecurityRequestContext(io.cdap.cdap.security.spi.authentication.SecurityRequestContext) Type(java.lang.reflect.Type) TransactionRunner(io.cdap.cdap.spi.data.transaction.TransactionRunner) Optional(java.util.Optional) Constants(io.cdap.cdap.common.conf.Constants) ProfileId(io.cdap.cdap.proto.id.ProfileId) Queue(java.util.Queue) ProgramOptionConstants(io.cdap.cdap.internal.app.runtime.ProgramOptionConstants) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) 
RunRecordDetailWithExistingStatus(io.cdap.cdap.internal.app.store.RunRecordDetailWithExistingStatus) ProgramRunners(io.cdap.cdap.internal.app.runtime.ProgramRunners) Retries(io.cdap.cdap.common.service.Retries) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy) Cluster(io.cdap.cdap.runtime.spi.provisioner.Cluster) WorkflowSpecification(io.cdap.cdap.api.workflow.WorkflowSpecification) HashMap(java.util.HashMap) ProgramType(io.cdap.cdap.proto.ProgramType) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) ProvisioningService(io.cdap.cdap.internal.provision.ProvisioningService) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramHeartbeatTable(io.cdap.cdap.reporting.ProgramHeartbeatTable) ProgramOptions(io.cdap.cdap.app.runtime.ProgramOptions) SystemArguments(io.cdap.cdap.internal.app.runtime.SystemArguments) WorkflowActionNode(io.cdap.cdap.api.workflow.WorkflowActionNode) LinkedList(java.util.LinkedList) Nullable(javax.annotation.Nullable) AppMetadataStore(io.cdap.cdap.internal.app.store.AppMetadataStore) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) JsonSyntaxException(com.google.gson.JsonSyntaxException) RunIds(io.cdap.cdap.common.app.RunIds) ProgramId(io.cdap.cdap.proto.id.ProgramId) ProgramDescriptor(io.cdap.cdap.app.program.ProgramDescriptor) IOException(java.io.IOException) ProjectInfo(io.cdap.cdap.common.utils.ProjectInfo) ProgramRunClusterStatus(io.cdap.cdap.proto.ProgramRunClusterStatus) TableNotFoundException(io.cdap.cdap.spi.data.TableNotFoundException) MetricsCollectionService(io.cdap.cdap.api.metrics.MetricsCollectionService) Store(io.cdap.cdap.app.store.Store) TimeUnit(java.util.concurrent.TimeUnit) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) Collections(java.util.Collections) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) SimpleProgramOptions(io.cdap.cdap.internal.app.runtime.SimpleProgramOptions) BasicArguments(io.cdap.cdap.internal.app.runtime.BasicArguments) 
ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy) SimpleProgramOptions(io.cdap.cdap.internal.app.runtime.SimpleProgramOptions) ProgramOptions(io.cdap.cdap.app.runtime.ProgramOptions) JsonSyntaxException(com.google.gson.JsonSyntaxException) IOException(java.io.IOException) TableNotFoundException(io.cdap.cdap.spi.data.TableNotFoundException)

Example 4 with RetryStrategy

use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.

the class ServiceProgramRunner method run.

/**
 * Validates the program options, builds a {@link ServiceHttpServer} for the given Service program
 * and starts it.
 *
 * <p>The instance id, instance count and host are read from the system arguments of
 * {@code options}; invalid or missing values are rejected through {@code Preconditions}. If
 * anything fails after the plugin instantiator has been created, the instantiator is closed
 * quietly and the failure is rethrown.
 *
 * @param program the program to run; its type must be {@code ProgramType.SERVICE}
 * @param options the program options carrying the system and user arguments
 * @return a controller wrapping the started HTTP server
 */
@Override
public ProgramController run(Program program, ProgramOptions options) {
    // --- Argument validation (order determines which check fails first) ---
    int instanceId = Integer.parseInt(options.getArguments().getOption(ProgramOptionConstants.INSTANCE_ID, "-1"));
    Preconditions.checkArgument(instanceId >= 0, "Missing instance Id");

    int totalInstances = Integer.parseInt(options.getArguments().getOption(ProgramOptionConstants.INSTANCES, "0"));
    Preconditions.checkArgument(totalInstances > 0, "Invalid or missing instance count");

    RunId runId = ProgramRunners.getRunId(options);

    ApplicationSpecification applicationSpec = program.getApplicationSpecification();
    Preconditions.checkNotNull(applicationSpec, "Missing application specification.");

    ProgramType type = program.getType();
    Preconditions.checkNotNull(type, "Missing processor type.");
    Preconditions.checkArgument(type == ProgramType.SERVICE, "Only Service process type is supported.");

    ServiceSpecification serviceSpec = applicationSpec.getServices().get(program.getName());

    String hostname = options.getArguments().getOption(ProgramOptionConstants.HOST);
    Preconditions.checkArgument(hostname != null, "No hostname is provided");

    // Setup dataset framework context, if required
    if (datasetFramework instanceof ProgramContextAware) {
        ((ProgramContextAware) datasetFramework).setContext(new BasicProgramContext(program.getId().run(runId)));
    }

    final PluginInstantiator pluginInstantiator = createPluginInstantiator(options, program.getClassLoader());
    try {
        RetryStrategy retryStrategy = SystemArguments.getRetryStrategy(
            options.getUserArguments().asMap(), program.getType(), cConf);
        ArtifactManager artifactManager = artifactManagerFactory.create(
            program.getId().getNamespaceId(), retryStrategy);
        ServiceHttpServer httpServer = new ServiceHttpServer(
            hostname, program, options, cConf, serviceSpec, instanceId, totalInstances,
            serviceAnnouncer, metricsCollectionService, datasetFramework, txClient,
            discoveryServiceClient, pluginInstantiator, secureStore, secureStoreManager,
            messagingService, artifactManager, metadataReader, metadataPublisher,
            namespaceQueryAdmin, pluginFinder, fieldLineageWriter, transactionRunner,
            preferencesFetcher, remoteClientFactory, contextAccessEnforcer);
        // Add a service listener to make sure the plugin instantiator is closed when the http server is finished.
        httpServer.addListener(
            createRuntimeServiceListener(Collections.singleton(pluginInstantiator)),
            Threads.SAME_THREAD_EXECUTOR);
        // The controller is created before the server is started.
        ProgramController controller = new ServiceProgramControllerAdapter(httpServer, program.getId().run(runId));
        httpServer.start();
        return controller;
    } catch (Throwable failure) {
        // Nothing else will close the instantiator if start-up did not complete.
        Closeables.closeQuietly(pluginInstantiator);
        throw failure;
    }
}
Also used : ApplicationSpecification(io.cdap.cdap.api.app.ApplicationSpecification) ServiceSpecification(io.cdap.cdap.api.service.ServiceSpecification) ProgramController(io.cdap.cdap.app.runtime.ProgramController) ProgramId(io.cdap.cdap.proto.id.ProgramId) BasicProgramContext(io.cdap.cdap.internal.app.runtime.BasicProgramContext) ServiceHttpServer(io.cdap.cdap.internal.app.services.ServiceHttpServer) ArtifactManager(io.cdap.cdap.api.artifact.ArtifactManager) PluginInstantiator(io.cdap.cdap.internal.app.runtime.plugin.PluginInstantiator) ProgramType(io.cdap.cdap.proto.ProgramType) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) ProgramContextAware(io.cdap.cdap.data.ProgramContextAware) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy)

Example 5 with RetryStrategy

use of io.cdap.cdap.common.service.RetryStrategy in project cdap by caskdata.

the class MainOutputCommitter method setupJob.

/**
 * Prepares this committer for a MapReduce job before delegating to the parent implementation.
 *
 * <p>Resolves the task context provider and injector from the job configuration, creates a
 * retrying transaction client, and, when the cluster mode is {@code ON_PREMISE}, starts a long
 * transaction and writes its encoded form to the transaction file so that tasks can re-use it.
 * Afterwards the task context and the outputs are initialised.
 *
 * @param jobContext the context of the job being set up
 * @throws IOException if writing the transaction file or the parent set-up fails
 */
@Override
public void setupJob(JobContext jobContext) throws IOException {
    Configuration jobConf = jobContext.getConfiguration();
    MapReduceTaskContextProvider taskContextProvider =
        MapReduceClassLoader.getFromConfiguration(jobConf).getTaskContextProvider();
    Injector injector = taskContextProvider.getInjector();
    cConf = injector.getInstance(CConfiguration.class);

    MapReduceContextConfig contextConfig = new MapReduceContextConfig(jobConf);
    ProgramId programId = contextConfig.getProgramId();
    LOG.info("Setting up for MapReduce job: namespaceId={}, applicationId={}, program={}, runid={}",
        programId.getNamespace(), programId.getApplication(), programId.getProgram(),
        ProgramRunners.getRunId(contextConfig.getProgramOptions()));

    RetryStrategy retryStrategy = SystemArguments.getRetryStrategy(
        contextConfig.getProgramOptions().getUserArguments().asMap(), programId.getType(), cConf);
    txClient = new RetryingLongTransactionSystemClient(
        injector.getInstance(TransactionSystemClient.class), retryStrategy);

    // We start long-running tx to be used by mapreduce job tasks when running on premise
    boolean onPremise = ProgramRunners.getClusterMode(contextConfig.getProgramOptions()) == ClusterMode.ON_PREMISE;
    if (onPremise) {
        transaction = txClient.startLong();
        // Write the tx somewhere, so that we can re-use it in mapreduce tasks
        Path txFile = getTxFile(jobConf, jobContext.getJobID());
        FileSystem txFileSystem = txFile.getFileSystem(jobConf);
        // 'false' = do not overwrite an existing file
        try (FSDataOutputStream txOut = txFileSystem.create(txFile, false)) {
            txOut.write(new TransactionCodec().encode(transaction));
        }
    }

    // we can instantiate the TaskContext after we set the tx above. It's used by the operations below
    taskContext = taskContextProvider.get(taskAttemptContext);
    outputs = Outputs.transform(contextConfig.getOutputs(), taskContext);
    super.setupJob(jobContext);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) ProgramId(io.cdap.cdap.proto.id.ProgramId) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) RetryingLongTransactionSystemClient(io.cdap.cdap.data2.transaction.RetryingLongTransactionSystemClient) Injector(com.google.inject.Injector) TransactionCodec(org.apache.tephra.TransactionCodec) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) RetryStrategy(io.cdap.cdap.common.service.RetryStrategy)

Aggregations

RetryStrategy (io.cdap.cdap.common.service.RetryStrategy)13 CConfiguration (io.cdap.cdap.common.conf.CConfiguration)9 Retries (io.cdap.cdap.common.service.Retries)4 ProgramId (io.cdap.cdap.proto.id.ProgramId)4 IOException (java.io.IOException)4 Gson (com.google.gson.Gson)3 Inject (com.google.inject.Inject)3 RetryableException (io.cdap.cdap.api.retry.RetryableException)3 Constants (io.cdap.cdap.common.conf.Constants)3 RetryStrategies (io.cdap.cdap.common.service.RetryStrategies)3 Service (com.google.common.util.concurrent.Service)2 AppenderContext (io.cdap.cdap.api.logging.AppenderContext)2 ProgramOptions (io.cdap.cdap.app.runtime.ProgramOptions)2 ProgramStateWriter (io.cdap.cdap.app.runtime.ProgramStateWriter)2 ClientConfig (io.cdap.cdap.client.config.ClientConfig)2 ConnectionConfig (io.cdap.cdap.client.config.ConnectionConfig)2 NotFoundException (io.cdap.cdap.common.NotFoundException)2 RetryOnStartFailureService (io.cdap.cdap.common.service.RetryOnStartFailureService)2 LogPipelineLoader (io.cdap.cdap.logging.framework.LogPipelineLoader)2 LogPipelineSpecification (io.cdap.cdap.logging.framework.LogPipelineSpecification)2