use of com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result in project hazelcast by hazelcast.
the class MasterSnapshotContext method onSnapshotPhase1Complete.
/**
 * Merges the phase-1 responses received from the members into a single
 * result and, once any outstanding responses to the start operation are
 * available, hands over to {@code onSnapshotPhase1CompleteWithStartResponses}.
 * The work is submitted to the coordinator thread.
 *
 * @param responses collected responses from the members
 * @param executionId ID of the execution the responses belong to
 * @param snapshotId ID of the snapshot
 * @param snapshotMapName the IMap name to which the snapshot is written
 * @param snapshotFlags flags of the snapshot
 * @param future a future to be completed when the phase-2 is fully completed
 */
private void onSnapshotPhase1Complete(Collection<Map.Entry<MemberInfo, Object>> responses, long executionId, long snapshotId, String snapshotMapName, int snapshotFlags, @Nullable CompletableFuture<Void> future) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        SnapshotPhase1Result combined = new SnapshotPhase1Result();
        List<CompletableFuture<Void>> pendingStartResponses = new ArrayList<>();
        for (Map.Entry<MemberInfo, Object> memberResponse : responses) {
            // the value is either a SnapshotPhase1Result or an exception, see #invokeOnParticipants() method
            Object value = memberResponse.getValue();
            if (!(value instanceof Throwable)) {
                combined.merge((SnapshotPhase1Result) value);
                continue;
            }
            if (value instanceof ExecutionNotFoundException) {
                // The member doesn't know the execution. We don't merge this as a failure here;
                // instead we remember the future of that member's response to the start operation,
                // collect all such futures, and wait for them later.
                pendingStartResponses.add(mc.startOperationResponses().get(memberResponse.getKey().getAddress()));
                continue;
            }
            // any other failure is merged as an erroneous, empty result
            combined.merge(new SnapshotPhase1Result(0, 0, 0, (Throwable) value));
        }
        if (!pendingStartResponses.isEmpty()) {
            LoggingUtil.logFine(logger, "%s will wait for %d responses to StartExecutionOperation in " + "onSnapshotPhase1Complete()", mc.jobIdString(), pendingStartResponses.size());
        }
        // In a typical case `pendingStartResponses` will be empty. It will be non-empty if some member
        // completed its execution and some other did not, or near the completion of a job, e.g. after a
        // failure. `allOf` for an empty array returns a completed future immediately.
        // Another edge case is that we'll be waiting for a response to start operation from a next
        // execution, which can happen much later - we could handle it, but we ignore it: when it arrives,
        // we'll find a changed executionId and ignore the response. It also doesn't occupy a thread -
        // we're using a future.
        CompletableFuture<?>[] awaited = pendingStartResponses.toArray(new CompletableFuture[0]);
        CompletableFuture.allOf(awaited).whenComplete(withTryCatch(logger, (ignoredValue, ignoredFailure) -> onSnapshotPhase1CompleteWithStartResponses(responses, executionId, snapshotId, snapshotMapName, snapshotFlags, future, combined, pendingStartResponses)));
    });
}
use of com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result in project hazelcast by hazelcast.
the class SnapshotContext method handlePhase1Done.
/**
 * Completes the pending phase-1 future with the accumulated byte, key and
 * chunk totals and the recorded error, then resets that per-snapshot state.
 * If the execution was cancelled, nothing is completed or reset.
 */
private synchronized void handlePhase1Done() {
    if (!isCancelled) {
        SnapshotPhase1Result result = new SnapshotPhase1Result(totalBytes.get(), totalKeys.get(), totalChunks.get(), snapshotError.get());
        // complete first, reset afterwards
        phase1Future.complete(result);
        phase1Future = null;
        snapshotError.set(null);
        totalBytes.set(0);
        totalKeys.set(0);
        totalChunks.set(0);
        currentMapName = null;
        return;
    }
    // cancelled: no phase-1 future is expected to be pending
    assert phase1Future == null : "phase1Future=" + phase1Future;
}
use of com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result in project hazelcast by hazelcast.
the class MasterSnapshotContext method onSnapshotPhase1CompleteWithStartResponses.
/**
 * Finishes phase 1 of a snapshot on the coordinator thread and starts phase 2.
 * <p>
 * While holding the lock acquired by {@code mc.lock()} it: ignores the
 * responses if the execution ID changed; merges failures of the awaited
 * start-operation responses into {@code mergedResult}; writes the
 * {@link SnapshotValidationRecord} into the snapshot map; records the
 * outcome in the job execution record and writes that record; clears the
 * snapshot map if phase 1 failed; and clears snapshot data for non-export
 * snapshots. After releasing the lock it invokes
 * {@link SnapshotPhase2Operation} on the participants.
 *
 * @param responses phase-1 responses collected from the members (only logged here)
 * @param executionId ID of the execution the responses belong to
 * @param snapshotId ID of the snapshot
 * @param snapshotMapName the IMap name to which the snapshot is written
 * @param snapshotFlags flags of the snapshot
 * @param future passed on to {@code onSnapshotPhase2Complete}
 * @param mergedResult phase-1 result merged from the member responses; further errors are merged into it
 * @param missingResponses start-operation response futures that were awaited; asserted to be done
 */
private void onSnapshotPhase1CompleteWithStartResponses(Collection<Entry<MemberInfo, Object>> responses, long executionId, long snapshotId, String snapshotMapName, int snapshotFlags, @Nullable CompletableFuture<Void> future, SnapshotPhase1Result mergedResult, List<CompletableFuture<Void>> missingResponses) {
mc.coordinationService().submitToCoordinatorThread(() -> {
mc.lock();
// assigned exactly once inside the locked section, read after it when starting phase 2
boolean isSuccess;
SnapshotStats stats;
try {
if (!missingResponses.isEmpty()) {
LoggingUtil.logFine(logger, "%s all awaited responses to StartExecutionOperation received or " + "were already received", mc.jobIdString());
}
// Check the execution ID to check if a new execution didn't start yet.
if (executionId != mc.executionId()) {
LoggingUtil.logFine(logger, "%s: ignoring responses for snapshot %s phase 1: " + "the responses are from a different execution: %s. Responses: %s", mc.jobIdString(), snapshotId, idToString(executionId), responses);
// a new execution started, ignore this response.
return;
}
// A start-operation response that completed exceptionally is merged as a phase-1 error;
// one that completed normally contributes nothing.
for (CompletableFuture<Void> response : missingResponses) {
assert response.isDone() : "response not done";
try {
response.get();
} catch (ExecutionException e) {
mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e.getCause()));
} catch (InterruptedException e) {
// restore the interrupt flag; the loop continues with the next response
Thread.currentThread().interrupt();
}
}
IMap<Object, Object> snapshotMap = mc.nodeEngine().getHazelcastInstance().getMap(snapshotMapName);
try {
SnapshotValidationRecord validationRecord = new SnapshotValidationRecord(snapshotId, mergedResult.getNumChunks(), mergedResult.getNumBytes(), mc.jobExecutionRecord().ongoingSnapshotStartTime(), mc.jobId(), mc.jobName(), mc.jobRecord().getDagJson());
// The decision moment for exported snapshots: after this the snapshot is valid to be restored
// from, however it will be not listed by JetInstance.getJobStateSnapshots unless the validation
// record is inserted into the cache below
Object oldValue = snapshotMap.put(SnapshotValidationRecord.KEY, validationRecord);
// only maps whose name starts with the exported-snapshot prefix get their record cached,
// keyed by the name with the prefix removed
if (snapshotMapName.startsWith(EXPORTED_SNAPSHOTS_PREFIX)) {
String snapshotName = snapshotMapName.substring(EXPORTED_SNAPSHOTS_PREFIX.length());
mc.jobRepository().cacheValidationRecord(snapshotName, validationRecord);
}
// a previous value under the validation key means a record was already present in this map
if (oldValue != null) {
logger.severe("SnapshotValidationRecord overwritten after writing to '" + snapshotMapName + "' for " + mc.jobIdString() + ": snapshot data might be corrupted");
}
} catch (Exception e) {
// a failure to write the validation record turns the whole phase 1 into a failure
mergedResult.merge(new SnapshotPhase1Result(0, 0, 0, e));
}
// success means no error was merged from any source above
isSuccess = mergedResult.getError() == null;
stats = mc.jobExecutionRecord().ongoingSnapshotDone(mergedResult.getNumBytes(), mergedResult.getNumKeys(), mergedResult.getNumChunks(), mergedResult.getError());
// the decision moment for regular snapshots: after this the snapshot is ready to be restored from
mc.writeJobExecutionRecord(false);
if (logger.isFineEnabled()) {
logger.fine(String.format("Snapshot %d phase 1 for %s completed with status %s in %dms, " + "%,d bytes, %,d keys in %,d chunks, stored in '%s', proceeding to phase 2", snapshotId, mc.jobIdString(), isSuccess ? "SUCCESS" : "FAILURE", stats.duration(), stats.numBytes(), stats.numKeys(), stats.numChunks(), snapshotMapName));
}
if (!isSuccess) {
logger.warning(mc.jobIdString() + " snapshot " + snapshotId + " phase 1 failed on some " + "member(s), one of the failures: " + mergedResult.getError());
// best-effort removal of the failed snapshot's data; a failure to clear is only logged
try {
snapshotMap.clear();
} catch (Exception e) {
logger.warning(mc.jobIdString() + ": failed to clear snapshot map '" + snapshotMapName + "' after a failure", e);
}
}
// For non-export snapshots, clear the snapshot data at the index reported by ongoingDataMapIndex().
// NOTE(review): presumably, after ongoingSnapshotDone() above, this index refers to the data map
// that does not hold the latest successful snapshot - confirm against JobExecutionRecord.
if (!SnapshotFlags.isExport(snapshotFlags)) {
mc.jobRepository().clearSnapshotData(mc.jobId(), mc.jobExecutionRecord().ongoingDataMapIndex());
}
} finally {
mc.unlock();
}
// start the phase 2
// the boolean passed to the operation is true only if phase 1 succeeded and the snapshot is not export-only
Function<ExecutionPlan, Operation> factory = plan -> new SnapshotPhase2Operation(mc.jobId(), executionId, snapshotId, isSuccess && !SnapshotFlags.isExportOnly(snapshotFlags));
mc.invokeOnParticipants(factory, responses2 -> onSnapshotPhase2Complete(mergedResult.getError(), responses2, executionId, snapshotId, snapshotFlags, future, stats.startTime()), null, true);
});
}
use of com.hazelcast.jet.impl.operation.SnapshotPhase1Operation.SnapshotPhase1Result in project hazelcast by hazelcast.
the class SnapshotContext method startNewSnapshotPhase1.
/**
 * Begins phase 1 of a new snapshot on this member. This method is called
 * when the member received {@link SnapshotPhase1Operation}.
 *
 * @param snapshotId ID of the new snapshot; asserted to be the previous snapshot ID incremented by 1
 * @param mapName name stored as the current snapshot map name
 * @param flags flags of the snapshot
 * @return a future for the phase-1 result; it is already completed with an
 *         empty result if {@code numSsTasklets} is 0
 * @throws RuntimeException if {@code snapshotId} equals the current snapshot ID
 * @throws CancellationException if the execution was cancelled
 */
synchronized CompletableFuture<SnapshotPhase1Result> startNewSnapshotPhase1(long snapshotId, String mapName, int flags) {
    if (snapshotId == currentSnapshotId) {
        // The same snapshotId arrived again, most likely because the operation was retried.
        // We throw instead of guessing the outcome of the previous attempt.
        String message = "new snapshotId equal to previous, operation probably retried. Previous=" + currentSnapshotId + ", new=" + snapshotId;
        throw new RuntimeException(message);
    }

    // sanity checks on the state left behind by the previous snapshot
    assert snapshotId == currentSnapshotId + 1 : "New snapshotId for " + jobNameAndExecutionId + " not incremented by 1. " + "Previous=" + currentSnapshotId + ", new=" + snapshotId;
    assert currentSnapshotId == activeSnapshotIdPhase1 : "last snapshot was postponed but not started";
    assert numSsTasklets >= 0 : "numSsTasklets=" + numSsTasklets;
    assert phase1Future == null : "phase 1 already in progress";
    assert phase2Future == null : "phase 2 still ongoing";
    assert snapshotId == activeSnapshotIdPhase2 + 1 : "snapshotId=" + snapshotId + ", activeSnapshotIdPhase2=" + activeSnapshotIdPhase2;

    if (isCancelled) {
        throw new CancellationException("execution cancelled");
    }

    this.snapshotFlags = flags;
    // the counter must have been 0; it is set to the value of numSsTasklets
    boolean counterWasZero = numRemainingTasklets.compareAndSet(0, numSsTasklets);
    assert counterWasZero : "numRemainingTasklets wasn't 0, but " + numRemainingTasklets.get();
    currentSnapshotId = snapshotId;
    currentMapName = mapName;

    if (numPrioritySsTasklets != 0) {
        // the snapshot will be started once all higher priority sources are done
        // see #taskletDone()
        logger.info("Snapshot " + snapshotId + " for " + jobNameAndExecutionId + " is postponed" + " until all higher priority vertices are completed (number of such vertices = " + numPrioritySsTasklets + ')');
    } else {
        // if there are no higher priority tasklets, start the snapshot immediately
        activeSnapshotIdPhase1 = currentSnapshotId;
    }

    if (numSsTasklets == 0) {
        // member is already done with the job and master didn't know it yet - we are immediately successful
        return completedFuture(new SnapshotPhase1Result(0, 0, 0, null));
    }

    CompletableFuture<SnapshotPhase1Result> pendingPhase1 = new CompletableFuture<>();
    phase1Future = pendingPhase1;
    return pendingPhase1;
}
Aggregations