Search in sources :

Example 1 with SnapshotFlags

use of com.hazelcast.jet.impl.execution.SnapshotFlags in project hazelcast by hazelcast.

the class MasterSnapshotContext method tryBeginSnapshot.

void tryBeginSnapshot() {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        boolean isTerminal;
        String snapshotMapName;
        CompletableFuture<Void> future;
        mc.lock();
        long localExecutionId;
        try {
            if (mc.jobStatus() != RUNNING) {
                logger.fine("Not beginning snapshot, " + mc.jobIdString() + " is not RUNNING, but " + mc.jobStatus());
                return;
            }
            if (snapshotInProgress) {
                logger.fine("Not beginning snapshot since one is already in progress " + mc.jobIdString());
                return;
            }
            if (terminalSnapshotFuture.isDone()) {
                logger.fine("Not beginning snapshot since terminal snapshot is already completed " + mc.jobIdString());
                return;
            }
            Tuple3<String, Boolean, CompletableFuture<Void>> requestedSnapshot = snapshotQueue.poll();
            if (requestedSnapshot == null) {
                return;
            }
            snapshotInProgress = true;
            snapshotMapName = requestedSnapshot.f0();
            assert requestedSnapshot.f1() != null;
            isTerminal = requestedSnapshot.f1();
            future = requestedSnapshot.f2();
            mc.jobExecutionRecord().startNewSnapshot(snapshotMapName);
            localExecutionId = mc.executionId();
        } finally {
            mc.unlock();
        }
        mc.writeJobExecutionRecord(false);
        long newSnapshotId = mc.jobExecutionRecord().ongoingSnapshotId();
        boolean isExport = snapshotMapName != null;
        int snapshotFlags = SnapshotFlags.create(isTerminal, isExport);
        String finalMapName = isExport ? exportedSnapshotMapName(snapshotMapName) : snapshotDataMapName(mc.jobId(), mc.jobExecutionRecord().ongoingDataMapIndex());
        mc.nodeEngine().getHazelcastInstance().getMap(finalMapName).clear();
        logFine(logger, "Starting snapshot %d for %s, flags: %s, writing to: %s", newSnapshotId, jobNameAndExecutionId(mc.jobName(), localExecutionId), SnapshotFlags.toString(snapshotFlags), snapshotMapName);
        Function<ExecutionPlan, Operation> factory = plan -> new SnapshotPhase1Operation(mc.jobId(), localExecutionId, newSnapshotId, finalMapName, snapshotFlags);
        // Need to take a copy of executionId: we don't cancel the scheduled task when the execution
        // finalizes. If a new execution is started in the meantime, we'll use the execution ID to detect it.
        mc.invokeOnParticipants(factory, responses -> onSnapshotPhase1Complete(responses, localExecutionId, newSnapshotId, finalMapName, snapshotFlags, future), null, true);
    });
}
Also used : SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) JetException(com.hazelcast.jet.JetException) Util.jobNameAndExecutionId(com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId) ILogger(com.hazelcast.logging.ILogger) Operation(com.hazelcast.spi.impl.operationservice.Operation) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) LinkedList(java.util.LinkedList) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Tuple3(com.hazelcast.jet.datamodel.Tuple3) SnapshotFlags(com.hazelcast.jet.impl.execution.SnapshotFlags) Collection(java.util.Collection) JobRepository.snapshotDataMapName(com.hazelcast.jet.impl.JobRepository.snapshotDataMapName) ExecutionException(java.util.concurrent.ExecutionException) JobRepository.exportedSnapshotMapName(com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Tuple3.tuple3(com.hazelcast.jet.datamodel.Tuple3.tuple3) List(java.util.List) Util.idToString(com.hazelcast.jet.Util.idToString) LoggingUtil.logFine(com.hazelcast.jet.impl.util.LoggingUtil.logFine) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) Entry(java.util.Map.Entry) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Queue(java.util.Queue) SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) IMap(com.hazelcast.map.IMap) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) Util.idToString(com.hazelcast.jet.Util.idToString) SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) Operation(com.hazelcast.spi.impl.operationservice.Operation) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan)

Example 2 with SnapshotFlags

use of com.hazelcast.jet.impl.execution.SnapshotFlags in project hazelcast by hazelcast.

the class MasterSnapshotContext method onSnapshotPhase1Complete.

/**
 * @param responses collected responses from the members
 * @param snapshotMapName the IMap name to which the snapshot is written
 * @param snapshotFlags flags of the snapshot
 * @param future a future to be completed when the phase-2 is fully completed
 */
private void onSnapshotPhase1Complete(Collection<Map.Entry<MemberInfo, Object>> responses, long executionId, long snapshotId, String snapshotMapName, int snapshotFlags, @Nullable CompletableFuture<Void> future) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        SnapshotPhase1Result mergedResult = new SnapshotPhase1Result();
        List<CompletableFuture<Void>> missingResponses = new ArrayList<>();
        for (Map.Entry<MemberInfo, Object> entry : responses) {
            // the response is either SnapshotOperationResult or an exception, see #invokeOnParticipants() method
            Object response = entry.getValue();
            if (response instanceof Throwable) {
                // all the responses to an array, and we'll wait for them later.
                if (response instanceof ExecutionNotFoundException) {
                    missingResponses.add(mc.startOperationResponses().get(entry.getKey().getAddress()));
                    continue;
                }
                response = new SnapshotPhase1Result(0, 0, 0, (Throwable) response);
            }
            mergedResult.merge((SnapshotPhase1Result) response);
        }
        if (!missingResponses.isEmpty()) {
            LoggingUtil.logFine(logger, "%s will wait for %d responses to StartExecutionOperation in " + "onSnapshotPhase1Complete()", mc.jobIdString(), missingResponses.size());
        }
        // In a typical case `missingResponses` will be empty. It will be non-empty if some member completed
        // its execution and some other did not, or near the completion of a job, e.g. after a failure.
        // `allOf` for an empty array returns a completed future immediately.
        // Another edge case is that we'll be waiting for a response to start operation from a next execution,
        // which can happen much later - we could handle it, but we ignore it: when it arrives, we'll find a
        // changed executionId and ignore the response. It also doesn't occupy a thread - we're using a future.
        CompletableFuture.allOf(missingResponses.toArray(new CompletableFuture[0])).whenComplete(withTryCatch(logger, (r, t) -> onSnapshotPhase1CompleteWithStartResponses(responses, executionId, snapshotId, snapshotMapName, snapshotFlags, future, mergedResult, missingResponses)));
    });
}
Also used : SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) JetException(com.hazelcast.jet.JetException) Util.jobNameAndExecutionId(com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId) ILogger(com.hazelcast.logging.ILogger) Operation(com.hazelcast.spi.impl.operationservice.Operation) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) LinkedList(java.util.LinkedList) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Tuple3(com.hazelcast.jet.datamodel.Tuple3) SnapshotFlags(com.hazelcast.jet.impl.execution.SnapshotFlags) Collection(java.util.Collection) JobRepository.snapshotDataMapName(com.hazelcast.jet.impl.JobRepository.snapshotDataMapName) ExecutionException(java.util.concurrent.ExecutionException) JobRepository.exportedSnapshotMapName(com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Tuple3.tuple3(com.hazelcast.jet.datamodel.Tuple3.tuple3) List(java.util.List) Util.idToString(com.hazelcast.jet.Util.idToString) LoggingUtil.logFine(com.hazelcast.jet.impl.util.LoggingUtil.logFine) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) Entry(java.util.Map.Entry) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Queue(java.util.Queue) SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) IMap(com.hazelcast.map.IMap) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) ArrayList(java.util.ArrayList) Map(java.util.Map) IMap(com.hazelcast.map.IMap)

Example 3 with SnapshotFlags

use of com.hazelcast.jet.impl.execution.SnapshotFlags in project hazelcast by hazelcast.

the class MasterSnapshotContext method onSnapshotPhase1CompleteWithStartResponses.

private void onSnapshotPhase1CompleteWithStartResponses(Collection<Entry<MemberInfo, Object>> responses, long executionId, long snapshotId, String snapshotMapName, int snapshotFlags, @Nullable CompletableFuture<Void> future, SnapshotPhase1Result mergedResult, List<CompletableFuture<Void>> missingResponses) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        mc.lock();
        boolean isSuccess;
        SnapshotStats stats;
        try {
            if (!missingResponses.isEmpty()) {
                LoggingUtil.logFine(logger, "%s all awaited responses to StartExecutionOperation received or " + "were already received", mc.jobIdString());
            }
            // Check the execution ID to check if a new execution didn't start yet.
            if (executionId != mc.executionId()) {
                LoggingUtil.logFine(logger, "%s: ignoring responses for snapshot %s phase 1: " + "the responses are from a different execution: %s. Responses: %s", mc.jobIdString(), snapshotId, idToString(executionId), responses);
                // a new execution started, ignore this response.
                return;
            }
            for (CompletableFuture<Void> response : missingResponses) {
                assert response.isDone() : "response not done";
                try {
                    response.get();
                } catch (ExecutionException e) {
                    mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e.getCause()));
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            IMap<Object, Object> snapshotMap = mc.nodeEngine().getHazelcastInstance().getMap(snapshotMapName);
            try {
                SnapshotValidationRecord validationRecord = new SnapshotValidationRecord(snapshotId, mergedResult.getNumChunks(), mergedResult.getNumBytes(), mc.jobExecutionRecord().ongoingSnapshotStartTime(), mc.jobId(), mc.jobName(), mc.jobRecord().getDagJson());
                // The decision moment for exported snapshots: after this the snapshot is valid to be restored
                // from, however it will be not listed by JetInstance.getJobStateSnapshots unless the validation
                // record is inserted into the cache below
                Object oldValue = snapshotMap.put(SnapshotValidationRecord.KEY, validationRecord);
                if (snapshotMapName.startsWith(EXPORTED_SNAPSHOTS_PREFIX)) {
                    String snapshotName = snapshotMapName.substring(EXPORTED_SNAPSHOTS_PREFIX.length());
                    mc.jobRepository().cacheValidationRecord(snapshotName, validationRecord);
                }
                if (oldValue != null) {
                    logger.severe("SnapshotValidationRecord overwritten after writing to '" + snapshotMapName + "' for " + mc.jobIdString() + ": snapshot data might be corrupted");
                }
            } catch (Exception e) {
                mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e));
            }
            isSuccess = mergedResult.getError() == null;
            stats = mc.jobExecutionRecord().ongoingSnapshotDone(mergedResult.getNumBytes(), mergedResult.getNumKeys(), mergedResult.getNumChunks(), mergedResult.getError());
            // the decision moment for regular snapshots: after this the snapshot is ready to be restored from
            mc.writeJobExecutionRecord(false);
            if (logger.isFineEnabled()) {
                logger.fine(String.format("Snapshot %d phase 1 for %s completed with status %s in %dms, " + "%,d bytes, %,d keys in %,d chunks, stored in '%s', proceeding to phase 2", snapshotId, mc.jobIdString(), isSuccess ? "SUCCESS" : "FAILURE", stats.duration(), stats.numBytes(), stats.numKeys(), stats.numChunks(), snapshotMapName));
            }
            if (!isSuccess) {
                logger.warning(mc.jobIdString() + " snapshot " + snapshotId + " phase 1 failed on some " + "member(s), one of the failures: " + mergedResult.getError());
                try {
                    snapshotMap.clear();
                } catch (Exception e) {
                    logger.warning(mc.jobIdString() + ": failed to clear snapshot map '" + snapshotMapName + "' after a failure", e);
                }
            }
            if (!SnapshotFlags.isExport(snapshotFlags)) {
                mc.jobRepository().clearSnapshotData(mc.jobId(), mc.jobExecutionRecord().ongoingDataMapIndex());
            }
        } finally {
            mc.unlock();
        }
        // start the phase 2
        Function<ExecutionPlan, Operation> factory = plan -> new SnapshotPhase2Operation(mc.jobId(), executionId, snapshotId, isSuccess && !SnapshotFlags.isExportOnly(snapshotFlags));
        mc.invokeOnParticipants(factory, responses2 -> onSnapshotPhase2Complete(mergedResult.getError(), responses2, executionId, snapshotId, snapshotFlags, future, stats.startTime()), null, true);
    });
}
Also used : SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) JetException(com.hazelcast.jet.JetException) Util.jobNameAndExecutionId(com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId) ILogger(com.hazelcast.logging.ILogger) Operation(com.hazelcast.spi.impl.operationservice.Operation) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) LinkedList(java.util.LinkedList) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Tuple3(com.hazelcast.jet.datamodel.Tuple3) SnapshotFlags(com.hazelcast.jet.impl.execution.SnapshotFlags) Collection(java.util.Collection) JobRepository.snapshotDataMapName(com.hazelcast.jet.impl.JobRepository.snapshotDataMapName) ExecutionException(java.util.concurrent.ExecutionException) JobRepository.exportedSnapshotMapName(com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Tuple3.tuple3(com.hazelcast.jet.datamodel.Tuple3.tuple3) List(java.util.List) Util.idToString(com.hazelcast.jet.Util.idToString) LoggingUtil.logFine(com.hazelcast.jet.impl.util.LoggingUtil.logFine) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) Entry(java.util.Map.Entry) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Queue(java.util.Queue) SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) IMap(com.hazelcast.map.IMap) SnapshotStats(com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats) SnapshotPhase1Result(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result) Util.idToString(com.hazelcast.jet.Util.idToString) SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) Operation(com.hazelcast.spi.impl.operationservice.Operation) SnapshotPhase1Operation(com.hazelcast.jet.impl.operation.SnapshotPhase1Operation) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) JetException(com.hazelcast.jet.JetException) ExecutionException(java.util.concurrent.ExecutionException) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) SnapshotPhase2Operation(com.hazelcast.jet.impl.operation.SnapshotPhase2Operation) ExecutionException(java.util.concurrent.ExecutionException)

Aggregations

MemberInfo (com.hazelcast.internal.cluster.MemberInfo)3 JetException (com.hazelcast.jet.JetException)3 Util.idToString (com.hazelcast.jet.Util.idToString)3 RUNNING (com.hazelcast.jet.core.JobStatus.RUNNING)3 Tuple3 (com.hazelcast.jet.datamodel.Tuple3)3 Tuple3.tuple3 (com.hazelcast.jet.datamodel.Tuple3.tuple3)3 SnapshotStats (com.hazelcast.jet.impl.JobExecutionRecord.SnapshotStats)3 EXPORTED_SNAPSHOTS_PREFIX (com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX)3 JobRepository.exportedSnapshotMapName (com.hazelcast.jet.impl.JobRepository.exportedSnapshotMapName)3 JobRepository.snapshotDataMapName (com.hazelcast.jet.impl.JobRepository.snapshotDataMapName)3 ExecutionNotFoundException (com.hazelcast.jet.impl.exception.ExecutionNotFoundException)3 SnapshotFlags (com.hazelcast.jet.impl.execution.SnapshotFlags)3 ExecutionPlan (com.hazelcast.jet.impl.execution.init.ExecutionPlan)3 SnapshotPhase1Operation (com.hazelcast.jet.impl.operation.SnapshotPhase1Operation)3 SnapshotPhase1Result (com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result)3 SnapshotPhase2Operation (com.hazelcast.jet.impl.operation.SnapshotPhase2Operation)3 ExceptionUtil.withTryCatch (com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch)3 LoggingUtil (com.hazelcast.jet.impl.util.LoggingUtil)3 LoggingUtil.logFine (com.hazelcast.jet.impl.util.LoggingUtil.logFine)3 Util.jobNameAndExecutionId (com.hazelcast.jet.impl.util.Util.jobNameAndExecutionId)3