Search in sources :

Example 1 with EXPORTED_SNAPSHOTS_PREFIX

Use of com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX in the hazelcast project by hazelcast.

In class MasterJobContext, method tryStartJob:

/**
 * Attempts to start a new execution of the job; all work is submitted to
 * the coordinator thread.
 * <p>
 * Contract: the job is started only if it is not already completed,
 * cancelled or failed. An already-cancelled job goes through the job
 * completion procedure instead. The start is rescheduled when the job
 * quorum is not satisfied, or when a membership change happened and the
 * partition table is not completely fixed yet. None of these checks are
 * visible in this body; they are presumably performed by
 * {@code resolveDagAndCL}, whose {@code null} result aborts the start.
 * <p>
 * When the start proceeds, the DAG is rewritten to restore from a snapshot
 * if one is available, the execution plans are built under the job's
 * class loader, and an {@code InitExecutionOperation} is invoked on every
 * participant. Any {@code Throwable} raised along the way is passed to
 * {@code finalizeJob}.
 *
 * @param executionIdSupplier forwarded to {@code resolveDagAndCL}
 */
void tryStartJob(Supplier<Long> executionIdSupplier) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        executionStartTime = System.currentTimeMillis();
        try {
            JobExecutionRecord execRecord = mc.jobExecutionRecord();
            execRecord.markExecuted();

            Tuple2<DAG, ClassLoader> resolved = resolveDagAndCL(executionIdSupplier);
            if (resolved == null) {
                return;
            }
            DAG dag = resolved.f0();
            assert dag != null;
            ClassLoader jobClassLoader = resolved.f1();

            // The DOT string has to be captured before rewriteDagWithSnapshotRestore()
            // touches the DAG, so that the logged graph is the one the user submitted.
            String dagAsDot = dag.toDotString(defaultParallelism, defaultQueueSize);

            // Pick the map to restore from: a snapshot recorded in the execution
            // record takes precedence over the initial snapshot named in the job config.
            long snapshotId = execRecord.snapshotId();
            String initialSnapshotName = mc.jobConfig().getInitialSnapshotName();
            String restoreMapName;
            if (snapshotId >= 0) {
                restoreMapName = execRecord.successfulSnapshotDataMapName(mc.jobId());
            } else if (initialSnapshotName != null) {
                restoreMapName = EXPORTED_SNAPSHOTS_PREFIX + initialSnapshotName;
            } else {
                restoreMapName = null;
            }

            if (restoreMapName == null) {
                logger.info("Didn't find any snapshot to restore for " + mc.jobIdString());
            } else {
                rewriteDagWithSnapshotRestore(dag, snapshotId, restoreMapName, initialSnapshotName);
            }

            MembersView membersView = Util.getMembersView(mc.nodeEngine());
            logger.info("Start executing " + mc.jobIdString()
                    + ", execution graph in DOT format:\n" + dagAsDot
                    + "\nHINT: You can use graphviz or http://viz-js.com to visualize the printed graph.");

            logger.fine("Building execution plan for " + mc.jobIdString());
            // Plans are built with the job's class loader installed.
            Util.doWithClassLoader(jobClassLoader, () -> mc.setExecutionPlanMap(
                    createExecutionPlans(
                            mc.nodeEngine(),
                            membersView.getMembers(),
                            dag,
                            mc.jobId(),
                            mc.executionId(),
                            mc.jobConfig(),
                            execRecord.ongoingSnapshotId(),
                            false,
                            mc.jobRecord().getSubject())));
            logger.fine("Built execution plans for " + mc.jobIdString());

            Set<MemberInfo> participants = mc.executionPlanMap().keySet();
            Version coordinatorVersion = mc.nodeEngine().getLocalMember().getVersion().asVersion();
            // One init operation per participant, each carrying that member's serialized plan.
            Function<ExecutionPlan, Operation> initOpFactory = executionPlan -> new InitExecutionOperation(
                    mc.jobId(),
                    mc.executionId(),
                    membersView.getVersion(),
                    coordinatorVersion,
                    participants,
                    mc.nodeEngine().getSerializationService().toData(executionPlan),
                    false);
            mc.invokeOnParticipants(initOpFactory, this::onInitStepCompleted, null, false);
        } catch (Throwable e) {
            finalizeJob(e);
        }
    });
}
Also used : Address(com.hazelcast.cluster.Address) SUSPEND(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.SUSPEND) NOT_RUNNING(com.hazelcast.jet.core.JobStatus.NOT_RUNNING) GetLocalJobMetricsOperation(com.hazelcast.jet.impl.operation.GetLocalJobMetricsOperation) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) NonCompletableFuture(com.hazelcast.jet.impl.util.NonCompletableFuture) ExceptionUtil.isTopologyException(com.hazelcast.jet.impl.util.ExceptionUtil.isTopologyException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) SourceProcessors.readMapP(com.hazelcast.jet.core.processor.SourceProcessors.readMapP) RESTART(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.RESTART) JetDelegatingClassLoader(com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader) TerminatedWithSnapshotException(com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException) Collectors.toMap(java.util.stream.Collectors.toMap) Functions.entryKey(com.hazelcast.function.Functions.entryKey) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) STARTING(com.hazelcast.jet.core.JobStatus.STARTING) SUSPENDED(com.hazelcast.jet.core.JobStatus.SUSPENDED) DAG(com.hazelcast.jet.core.DAG) JobStatus(com.hazelcast.jet.core.JobStatus) ExceptionUtil(com.hazelcast.jet.impl.util.ExceptionUtil) JobMetrics(com.hazelcast.jet.core.metrics.JobMetrics) CancellationException(java.util.concurrent.CancellationException) CANCEL_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_GRACEFUL) Collections.emptyList(java.util.Collections.emptyList) Collection(java.util.Collection) Set(java.util.Set) UUID(java.util.UUID) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Collectors(java.util.stream.Collectors) CANCEL_FORCEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_FORCEFUL) Objects(java.util.Objects) Util(com.hazelcast.jet.impl.util.Util) List(java.util.List) 
Util.idToString(com.hazelcast.jet.Util.idToString) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) MetricNames(com.hazelcast.jet.core.metrics.MetricNames) Entry(java.util.Map.Entry) TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) COMPLETED(com.hazelcast.jet.core.JobStatus.COMPLETED) JetDisabledException(com.hazelcast.jet.impl.exception.JetDisabledException) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) ExecutionPlanBuilder.createExecutionPlans(com.hazelcast.jet.impl.execution.init.ExecutionPlanBuilder.createExecutionPlans) Collectors.partitioningBy(java.util.stream.Collectors.partitioningBy) TerminateExecutionOperation(com.hazelcast.jet.impl.operation.TerminateExecutionOperation) ExceptionUtil.isRestartableException(com.hazelcast.jet.impl.util.ExceptionUtil.isRestartableException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) LoggingUtil.logFinest(com.hazelcast.jet.impl.util.LoggingUtil.logFinest) Util.doWithClassLoader(com.hazelcast.jet.impl.util.Util.doWithClassLoader) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionService(com.hazelcast.spi.impl.executionservice.ExecutionService) StartExecutionOperation(com.hazelcast.jet.impl.operation.StartExecutionOperation) Function(java.util.function.Function) Supplier(java.util.function.Supplier) Util.formatJobDuration(com.hazelcast.jet.impl.util.Util.formatJobDuration) ActionAfterTerminate(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) JetException(com.hazelcast.jet.JetException) HashSet(java.util.HashSet) InitExecutionOperation(com.hazelcast.jet.impl.operation.InitExecutionOperation) COORDINATOR(com.hazelcast.jet.impl.JobClassLoaderService.JobPhase.COORDINATOR) ILogger(com.hazelcast.logging.ILogger) 
SnapshotValidator.validateSnapshot(com.hazelcast.jet.impl.SnapshotValidator.validateSnapshot) ExceptionUtil.rethrow(com.hazelcast.jet.impl.util.ExceptionUtil.rethrow) Operation(com.hazelcast.spi.impl.operationservice.Operation) Util.entry(com.hazelcast.jet.Util.entry) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) BiConsumer(java.util.function.BiConsumer) MembersView(com.hazelcast.internal.cluster.impl.MembersView) LocalMemberResetException(com.hazelcast.core.LocalMemberResetException) RESTART_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.RESTART_GRACEFUL) Edge(com.hazelcast.jet.core.Edge) Version(com.hazelcast.version.Version) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Tuple2(com.hazelcast.jet.datamodel.Tuple2) Nullable(javax.annotation.Nullable) Job(com.hazelcast.jet.Job) Measurement(com.hazelcast.jet.core.metrics.Measurement) SUSPENDED_EXPORTING_SNAPSHOT(com.hazelcast.jet.core.JobStatus.SUSPENDED_EXPORTING_SNAPSHOT) Util.toList(com.hazelcast.jet.impl.util.Util.toList) RawJobMetrics(com.hazelcast.jet.impl.metrics.RawJobMetrics) MetricTags(com.hazelcast.jet.core.metrics.MetricTags) NONE(com.hazelcast.jet.config.ProcessingGuarantee.NONE) Consumer(java.util.function.Consumer) Vertex(com.hazelcast.jet.core.Vertex) Tuple2.tuple2(com.hazelcast.jet.datamodel.Tuple2.tuple2) CustomClassLoadedObject.deserializeWithCustomClassLoader(com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject.deserializeWithCustomClassLoader) ExceptionUtil.peel(com.hazelcast.jet.impl.util.ExceptionUtil.peel) FAILED(com.hazelcast.jet.core.JobStatus.FAILED) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Collections(java.util.Collections) IMap(com.hazelcast.map.IMap) Edge.between(com.hazelcast.jet.core.Edge.between) MembersView(com.hazelcast.internal.cluster.impl.MembersView) DAG(com.hazelcast.jet.core.DAG) Util.idToString(com.hazelcast.jet.Util.idToString) 
GetLocalJobMetricsOperation(com.hazelcast.jet.impl.operation.GetLocalJobMetricsOperation) TerminateExecutionOperation(com.hazelcast.jet.impl.operation.TerminateExecutionOperation) StartExecutionOperation(com.hazelcast.jet.impl.operation.StartExecutionOperation) InitExecutionOperation(com.hazelcast.jet.impl.operation.InitExecutionOperation) Operation(com.hazelcast.spi.impl.operationservice.Operation) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Version(com.hazelcast.version.Version) InitExecutionOperation(com.hazelcast.jet.impl.operation.InitExecutionOperation) JetDelegatingClassLoader(com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader) Util.doWithClassLoader(com.hazelcast.jet.impl.util.Util.doWithClassLoader) CustomClassLoadedObject.deserializeWithCustomClassLoader(com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject.deserializeWithCustomClassLoader)

Example 2 with EXPORTED_SNAPSHOTS_PREFIX

Use of com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX in the hazelcast project by hazelcast.

In class MasterSnapshotContext, method onSnapshotPhase1CompleteWithStartResponses:

/**
 * Finishes phase 1 of a snapshot and kicks off phase 2. All work is
 * submitted to the coordinator thread.
 * <p>
 * While holding {@code mc.lock()} it: ignores the call if
 * {@code executionId} no longer matches the current execution; folds
 * failures of the {@code missingResponses} futures into
 * {@code mergedResult}; writes the {@link SnapshotValidationRecord} into
 * the snapshot map (and caches it when the map is an exported snapshot);
 * records the outcome in the job execution record and persists it; and
 * clears the snapshot map if phase 1 failed. After the lock is released,
 * a {@code SnapshotPhase2Operation} is invoked on the participants.
 *
 * @param responses        phase-1 responses; in this method only used in a log message
 * @param executionId      execution the responses belong to; compared with {@code mc.executionId()}
 * @param snapshotId       ID of the snapshot being taken
 * @param snapshotMapName  name of the IMap the snapshot data was written to
 * @param snapshotFlags    flags inspected through {@code SnapshotFlags.isExport}/{@code isExportOnly}
 * @param future           passed on to {@code onSnapshotPhase2Complete}; may be {@code null}
 * @param mergedResult     accumulated phase-1 result; further errors are merged into it here
 * @param missingResponses futures of the awaited StartExecutionOperation responses (per the log
 *                         message below); asserted to be done already
 */
private void onSnapshotPhase1CompleteWithStartResponses(Collection<Entry<MemberInfo, Object>> responses, long executionId, long snapshotId, String snapshotMapName, int snapshotFlags, @Nullable CompletableFuture<Void> future, SnapshotPhase1Result mergedResult, List<CompletableFuture<Void>> missingResponses) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        mc.lock();
        // Assigned exactly once inside the locked section; read by the phase-2 lambdas below.
        boolean isSuccess;
        SnapshotStats stats;
        try {
            if (!missingResponses.isEmpty()) {
                LoggingUtil.logFine(logger, "%s all awaited responses to StartExecutionOperation received or " + "were already received", mc.jobIdString());
            }
            // Compare the execution ID to detect whether a new execution has started in the meantime.
            if (executionId != mc.executionId()) {
                LoggingUtil.logFine(logger, "%s: ignoring responses for snapshot %s phase 1: " + "the responses are from a different execution: %s. Responses: %s", mc.jobIdString(), snapshotId, idToString(executionId), responses);
                // a new execution started, ignore this response.
                return;
            }
            // The futures are already complete, so get() does not block; it is called only to
            // surface a failure, whose cause is merged into the overall phase-1 result.
            for (CompletableFuture<Void> response : missingResponses) {
                assert response.isDone() : "response not done";
                try {
                    response.get();
                } catch (ExecutionException e) {
                    mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e.getCause()));
                } catch (InterruptedException e) {
                    // restore the interrupt flag for whoever owns this thread
                    Thread.currentThread().interrupt();
                }
            }
            IMap<Object, Object> snapshotMap = mc.nodeEngine().getHazelcastInstance().getMap(snapshotMapName);
            try {
                SnapshotValidationRecord validationRecord = new SnapshotValidationRecord(snapshotId, mergedResult.getNumChunks(), mergedResult.getNumBytes(), mc.jobExecutionRecord().ongoingSnapshotStartTime(), mc.jobId(), mc.jobName(), mc.jobRecord().getDagJson());
                // The decision moment for exported snapshots: after this the snapshot is valid to be restored
                // from, however it will not be listed by JetInstance.getJobStateSnapshots unless the validation
                // record is inserted into the cache below
                Object oldValue = snapshotMap.put(SnapshotValidationRecord.KEY, validationRecord);
                if (snapshotMapName.startsWith(EXPORTED_SNAPSHOTS_PREFIX)) {
                    // the exported snapshot's name is the map name with the prefix stripped
                    String snapshotName = snapshotMapName.substring(EXPORTED_SNAPSHOTS_PREFIX.length());
                    mc.jobRepository().cacheValidationRecord(snapshotName, validationRecord);
                }
                // A pre-existing validation record means the map already held one before this write.
                if (oldValue != null) {
                    logger.severe("SnapshotValidationRecord overwritten after writing to '" + snapshotMapName + "' for " + mc.jobIdString() + ": snapshot data might be corrupted");
                }
            } catch (Exception e) {
                // failing to write the validation record makes the whole phase 1 fail
                mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e));
            }
            isSuccess = mergedResult.getError() == null;
            stats = mc.jobExecutionRecord().ongoingSnapshotDone(mergedResult.getNumBytes(), mergedResult.getNumKeys(), mergedResult.getNumChunks(), mergedResult.getError());
            // the decision moment for regular snapshots: after this the snapshot is ready to be restored from
            mc.writeJobExecutionRecord(false);
            if (logger.isFineEnabled()) {
                logger.fine(String.format("Snapshot %d phase 1 for %s completed with status %s in %dms, " + "%,d bytes, %,d keys in %,d chunks, stored in '%s', proceeding to phase 2", snapshotId, mc.jobIdString(), isSuccess ? "SUCCESS" : "FAILURE", stats.duration(), stats.numBytes(), stats.numKeys(), stats.numChunks(), snapshotMapName));
            }
            if (!isSuccess) {
                logger.warning(mc.jobIdString() + " snapshot " + snapshotId + " phase 1 failed on some " + "member(s), one of the failures: " + mergedResult.getError());
                // Best-effort cleanup of the failed snapshot's data; a failure to clear is only logged.
                try {
                    snapshotMap.clear();
                } catch (Exception e) {
                    logger.warning(mc.jobIdString() + ": failed to clear snapshot map '" + snapshotMapName + "' after a failure", e);
                }
            }
            // NOTE(review): presumably this drops the data map that is no longer the current one
            // after ongoingSnapshotDone() above - confirm against JobExecutionRecord.
            if (!SnapshotFlags.isExport(snapshotFlags)) {
                mc.jobRepository().clearSnapshotData(mc.jobId(), mc.jobExecutionRecord().ongoingDataMapIndex());
            }
        } finally {
            mc.unlock();
        }
        // start the phase 2 (outside of the lock); the operation's last argument is true only when
        // phase 1 succeeded and the snapshot is not export-only
        Function<ExecutionPlan, Operation> factory = plan -> new SnapshotPhase2Operation(mc.jobId(), executionId, snapshotId, isSuccess && !SnapshotFlags.isExportOnly(snapshotFlags));
        mc.invokeOnParticipants(factory, responses2 -> onSnapshotPhase2Complete(mergedResult.getError(), responses2, executionId, snapshotId, snapshotFlags, future, stats.startTime()), null, true);
    });
}
Also used : SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) JetException(com.hazelcast.jet.JetException) Util.jobNameAndExecutionId(com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId) ILogger(com.hazelcast.logging.ILogger) Operation(com.hazelcast.spi.impl.operationservice.Operation) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) LinkedList(java.util.LinkedList) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Tuple3(com.hazelcast.jet.datamodel.Tuple3) SnapshotFlags(com.hazelcast.jet.impl.execution.SnapshotFlags) Collection(java.util.Collection) JobRepository.snapshotDataMapName(com.hazelcast.jet.impl.JobRepository.snapshotDataMapName) ExecutionException(java.util.concurrent.ExecutionException) JobRepository.exportedSnapshotMapName(com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Tuple3.tuple3(com.hazelcast.jet.datamodel.Tuple3.tuple3) List(java.util.List) Util.idToString(com.hazelcast.jet.Util.idToString) LoggingUtil.logFine(com.hazelcast.jet.impl.util.LoggingUtil.logFine) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) Entry(java.util.Map.Entry) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Queue(java.util.Queue) 
SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) IMap(com.hazelcast.map.IMap) SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Util.idToString(com.hazelcast.jet.Util.idToString) SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) Operation(com.hazelcast.spi.impl.operationservice.Operation) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) JetException(com.hazelcast.jet.JetException) ExecutionException(java.util.concurrent.ExecutionException) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) ExecutionException(java.util.concurrent.ExecutionException)

Aggregations

MemberInfo (com.hazelcast.internal.cluster.MemberInfo)2 JetException (com.hazelcast.jet.JetException)2 Util.idToString (com.hazelcast.jet.Util.idToString)2 RUNNING (com.hazelcast.jet.core.JobStatus.RUNNING)2 EXPORTED_SNAPSHOTS_PREFIX (com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX)2 Address (com.hazelcast.cluster.Address)1 LocalMemberResetException (com.hazelcast.core.LocalMemberResetException)1 Functions.entryKey (com.hazelcast.function.Functions.entryKey)1 MembersView (com.hazelcast.internal.cluster.impl.MembersView)1 Job (com.hazelcast.jet.Job)1 Util.entry (com.hazelcast.jet.Util.entry)1 NONE (com.hazelcast.jet.config.ProcessingGuarantee.NONE)1 DAG (com.hazelcast.jet.core.DAG)1 Edge (com.hazelcast.jet.core.Edge)1 Edge.between (com.hazelcast.jet.core.Edge.between)1 JobStatus (com.hazelcast.jet.core.JobStatus)1 COMPLETED (com.hazelcast.jet.core.JobStatus.COMPLETED)1 FAILED (com.hazelcast.jet.core.JobStatus.FAILED)1 NOT_RUNNING (com.hazelcast.jet.core.JobStatus.NOT_RUNNING)1 STARTING (com.hazelcast.jet.core.JobStatus.STARTING)1