Use of com.hazelcast.jet.core.TopologyChangedException in project hazelcast by hazelcast.
The class MasterJobContext, method getErrorFromResponses.
/**
 * Derives the overall outcome of an operation from the responses of the
 * individual members. The checks are applied in this order:
 * <ol>
 * <li>If {@code isCancelled()} is true, a {@link CancellationException} is
 * returned without looking at the responses.
 * <li>If the number of non-{@code Throwable} responses equals the size of
 * the execution plan map, {@code null} is returned (no failure).
 * <li>If every failure is a {@link TerminatedWithSnapshotException}, the
 * execution stopped due to a requested termination: a
 * {@link CancellationException} is returned for {@code CANCEL_GRACEFUL},
 * a {@link JobTerminateRequestedException} for any other mode.
 * <li>If at least one failure is neither a {@link CancellationException},
 * a {@link TerminatedWithSnapshotException} nor a topology exception (for
 * example an exception in user code), the first such failure is returned.
 * <li>Otherwise a {@link TopologyChangedException} is returned so that the
 * job will be restarted.
 * </ol>
 *
 * @param opName    name of the operation the responses belong to; used in log messages
 * @param responses per-member responses; a {@code Throwable} value marks a failure
 * @return the error describing the outcome, or {@code null} if there was no failure
 */
private Throwable getErrorFromResponses(String opName, Collection<Map.Entry<MemberInfo, Object>> responses) {
    if (isCancelled()) {
        logger.fine(mc.jobIdString() + " to be cancelled after " + opName);
        return new CancellationException();
    }

    // Re-key the responses by member address and split them into
    // failures (true) and successes (false).
    Map<Boolean, List<Entry<Address, Object>>> byOutcome = responses.stream()
            .map(response -> entry(response.getKey().getAddress(), response.getValue()))
            .collect(partitioningBy(response -> response.getValue() instanceof Throwable));

    // partitioningBy always provides both the true and the false key
    if (byOutcome.get(false).size() == mc.executionPlanMap().size()) {
        logger.fine(opName + " of " + mc.jobIdString() + " was successful");
        return null;
    }

    List<Entry<Address, Object>> failures = byOutcome.get(true);
    if (!failures.isEmpty()) {
        logger.fine(opName + " of " + mc.jobIdString() + " has failures: " + failures);
    }

    // Termination after a terminal snapshot is reported only when no other
    // kind of failure is present; if there are other exceptions, those are
    // handled further below instead.
    boolean onlyTerminalSnapshotFailures = failures.stream()
            .allMatch(failure -> failure.getValue() instanceof TerminatedWithSnapshotException);
    if (onlyTerminalSnapshotFailures) {
        assert opName.equals("Execution") : "opName is '" + opName + "', expected 'Execution'";
        logger.fine(opName + " of " + mc.jobIdString() + " terminated after a terminal snapshot");
        TerminationMode requestedMode = requestedTerminationMode;
        assert requestedMode != null && requestedMode.isWithTerminalSnapshot() : "mode=" + requestedMode;
        if (requestedMode == CANCEL_GRACEFUL) {
            return new CancellationException();
        }
        return new JobTerminateRequestedException(requestedMode);
    }

    // Keep only the failures that are not cancellation-, snapshot-termination-
    // or topology-related. If none remain, the whole failure is treated as a
    // topology change.
    List<Entry<Address, Object>> nonTopologyFailures = failures.stream()
            .filter(failure -> !(failure.getValue() instanceof CancellationException)
                    && !(failure.getValue() instanceof TerminatedWithSnapshotException)
                    && !isTopologyException((Throwable) failure.getValue()))
            .collect(Collectors.toList());
    if (nonTopologyFailures.isEmpty()) {
        // every failure is topology-related here, so the full list is reported
        return new TopologyChangedException("Causes from members: " + failures);
    }
    return (Throwable) nonTopologyFailures.get(0).getValue();
}
Use of com.hazelcast.jet.core.TopologyChangedException in project hazelcast by hazelcast.
The class ExecutionPlan, method initialize.
/**
 * A method called on the members as part of the InitExecutionOperation.
 * Creates tasklets, inboxes/outboxes and connects these to make them ready
 * for a later StartExecutionOperation.
 * <p>
 * The steps are, in order: store the node engine and execution id,
 * initialize the processor suppliers and the DAG, resolve a connection to
 * every remote member, and then, for each vertex, create its processors
 * together with (for non-light jobs) a snapshot-storing tasklet, and one
 * processor tasklet per processor. Finally all receiver tasklets collected
 * in {@code receiverMap} are added to {@code tasklets}.
 *
 * @param nodeEngine node engine of this member; stored in this plan and
 *                   used to obtain services, loggers and the tasklet executor
 * @param jobId id of the job; used for the processor class loader lookup
 *              and for name prefixes
 * @param executionId id of the execution; stored in this plan and passed
 *                    to the processor contexts
 * @param snapshotContext passed to the snapshot-storing tasklets and the
 *                        processor tasklets
 * @param tempDirectories passed to {@code initProcSuppliers} and to the
 *                        processor contexts
 * @param jobSerializationService passed to the DAG/supplier initialization,
 *                                the edge streams and the tasklets
 * @throws TopologyChangedException if there is no connection to one of the
 *                                  remote job participants
 */
public void initialize(NodeEngineImpl nodeEngine, long jobId, long executionId, @Nonnull SnapshotContext snapshotContext, ConcurrentHashMap<String, File> tempDirectories, InternalSerializationService jobSerializationService) {
this.nodeEngine = nodeEngine;
this.jobClassLoaderService = ((JetServiceBackend) nodeEngine.getService(JetServiceBackend.SERVICE_NAME)).getJobClassLoaderService();
this.executionId = executionId;
initProcSuppliers(jobId, tempDirectories, jobSerializationService);
initDag(jobSerializationService);
this.ptionArrgmt = new PartitionArrangement(partitionAssignment, nodeEngine.getThisAddress());
Set<Integer> higherPriorityVertices = VertexDef.getHigherPriorityVertices(vertices);
// Resolve the connection to each remote member up front; a missing connection
// aborts the initialization with a TopologyChangedException.
for (Address destAddr : remoteMembers.get()) {
Connection conn = getMemberConnection(nodeEngine, destAddr);
if (conn == null) {
throw new TopologyChangedException("no connection to job participant: " + destAddr);
}
memberConnections.put(destAddr, conn);
}
for (VertexDef vertex : vertices) {
// Light jobs use no processor class loader (null is passed to doWithClassLoader).
ClassLoader processorClassLoader = isLightJob ? null : jobClassLoaderService.getProcessorClassLoader(jobId, vertex.name());
Collection<? extends Processor> processors = doWithClassLoader(processorClassLoader, () -> createProcessors(vertex, vertex.localParallelism()));
String jobPrefix = prefix(jobConfig.getName(), jobId, vertex.name());
// create StoreSnapshotTasklet and the queues to it
ConcurrentConveyor<Object> ssConveyor = null;
if (!isLightJob) {
// Note that we create the snapshot queues for all non-light jobs, even if they don't have
// processing guarantee enabled, because in EE one can request a snapshot also for
// non-snapshotted jobs.
// One snapshot queue per local processor of this vertex.
@SuppressWarnings("unchecked") QueuedPipe<Object>[] snapshotQueues = new QueuedPipe[vertex.localParallelism()];
Arrays.setAll(snapshotQueues, i -> new OneToOneConcurrentArrayQueue<>(SNAPSHOT_QUEUE_SIZE));
ssConveyor = ConcurrentConveyor.concurrentConveyor(null, snapshotQueues);
ILogger storeSnapshotLogger = prefixedLogger(nodeEngine.getLogger(StoreSnapshotTasklet.class), jobPrefix);
StoreSnapshotTasklet ssTasklet = new StoreSnapshotTasklet(snapshotContext, ConcurrentInboundEdgeStream.create(ssConveyor, 0, 0, true, jobPrefix + "/ssFrom", null), new AsyncSnapshotWriterImpl(nodeEngine, snapshotContext, vertex.name(), memberIndex, memberCount, jobSerializationService), storeSnapshotLogger, vertex.name(), higherPriorityVertices.contains(vertex.vertexId()));
tasklets.add(ssTasklet);
}
int localProcessorIdx = 0;
for (Processor processor : processors) {
// Global index = offset of this member's processors (memberIndex * localParallelism)
// plus the processor's local index.
int globalProcessorIndex = memberIndex * vertex.localParallelism() + localProcessorIdx;
String processorPrefix = prefix(jobConfig.getName(), jobId, vertex.name(), globalProcessorIndex);
ILogger logger = prefixedLogger(nodeEngine.getLogger(processor.getClass()), processorPrefix);
ProcCtx context = new ProcCtx(nodeEngine, jobId, executionId, getJobConfig(), logger, vertex.name(), localProcessorIdx, globalProcessorIndex, isLightJob, partitionAssignment, vertex.localParallelism(), memberIndex, memberCount, tempDirectories, jobSerializationService, subject, processorClassLoader);
// createOutboundEdgeStreams() populates localConveyorMap and edgeSenderConveyorMap.
// Also populates instance fields: senderMap, receiverMap, tasklets.
List<OutboundEdgeStream> outboundStreams = createOutboundEdgeStreams(vertex, localProcessorIdx, jobPrefix, jobSerializationService);
List<InboundEdgeStream> inboundStreams = createInboundEdgeStreams(vertex, localProcessorIdx, jobPrefix, globalProcessorIndex);
// No snapshot collector when there is no snapshot conveyor (i.e. for light jobs).
OutboundCollector snapshotCollector = ssConveyor == null ? null : new ConveyorCollector(ssConveyor, localProcessorIdx, null);
// vertices which are only used for snapshot restore will not be marked as "source=true" in metrics
// also do not consider snapshot restore edges for determining source tag
// (allMatch is also true for a vertex with no inbound edges at all)
boolean isSource = vertex.inboundEdges().stream().allMatch(EdgeDef::isSnapshotRestoreEdge) && !vertex.isSnapshotVertex();
ProcessorTasklet processorTasklet = new ProcessorTasklet(context, nodeEngine.getExecutionService().getExecutor(TASKLET_INIT_CLOSE_EXECUTOR_NAME), jobSerializationService, processor, inboundStreams, outboundStreams, snapshotContext, snapshotCollector, isSource);
tasklets.add(processorTasklet);
this.processors.add(processor);
localProcessorIdx++;
}
}
// Flatten the three-level receiverMap into a single list and register all
// receiver tasklets after the per-vertex tasklets.
List<ReceiverTasklet> allReceivers = receiverMap.values().stream().flatMap(o -> o.values().stream()).flatMap(a -> a.values().stream()).collect(toList());
tasklets.addAll(allReceivers);
}
Use of com.hazelcast.jet.core.TopologyChangedException in project hazelcast by hazelcast.
The class JobExecutionService, method verifyClusterInformation.
/**
 * Checks that this member's view of the cluster is compatible with the
 * coordinator's view before an execution is initialized.
 *
 * @param jobId                        id of the job, used in error messages
 * @param executionId                  id of the execution, used in error messages
 * @param coordinator                  address of the coordinator, used in error messages
 * @param coordinatorMemberListVersion member list version known to the coordinator
 * @param participants                 members expected to participate in the execution
 * @throws RetryableHazelcastException if the coordinator has a newer member
 *         list than this member
 * @throws TopologyChangedException    if a participant is not found in the
 *         local member list
 * @throws IllegalArgumentException    if this member is not among the participants
 */
private void verifyClusterInformation(long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set<MemberInfo> participants) {
    Address master = nodeEngine.getMasterAddress();
    ClusterServiceImpl clusterService = (ClusterServiceImpl) nodeEngine.getClusterService();
    MembershipManager memberships = clusterService.getMembershipManager();
    int localVersion = memberships.getMemberListVersion();
    Address self = nodeEngine.getThisAddress();

    if (coordinatorMemberListVersion > localVersion) {
        if (master != null) {
            assert !master.equals(self) : String.format("Local node: %s is master but InitOperation has coordinator member list version: %s larger than " + " local member list version: %s", self, coordinatorMemberListVersion, localVersion);
            // Ask the master to publish the member list, then let the caller retry.
            nodeEngine.getOperationService().send(new TriggerMemberListPublishOp(), master);
            throw new RetryableHazelcastException(String.format(
                    "Cannot initialize %s for coordinator %s, local member list version %s,"
                            + " coordinator member list version %s",
                    jobIdAndExecutionId(jobId, executionId), coordinator, localVersion,
                    coordinatorMemberListVersion));
        }
        // The master is not known to this member, so a fresh member list cannot
        // be requested. The failure is retryable: the master is expected to
        // become known later (a new master is elected or a split brain merge
        // happens).
        throw new RetryableHazelcastException(String.format(
                "Cannot initialize %s for coordinator %s, local member list version %s,"
                        + " coordinator member list version %s. And also, since the master address"
                        + " is not known to this member, cannot request a new member list from master.",
                jobIdAndExecutionId(jobId, executionId), coordinator, localVersion,
                coordinatorMemberListVersion));
    }

    // Participants may receive a new member list before the coordinator does,
    // which leads to "coordinatorMemberListVersion < localMemberListVersion".
    // When that happens because a participant left, the job start fails:
    // retrying the InitExecutionOperation cannot resolve an unknown participant,
    // so nothing is done here and the check below fails instead.
    // When it happens because a member was added, the job start does not fail,
    // since all participants are known to each other. The only downside is that
    // the new member will not participate and the partition mapping may not be
    // completely proper.
    boolean localIsParticipant = false;
    for (MemberInfo candidate : participants) {
        Address candidateAddress = candidate.getAddress();
        localIsParticipant |= candidateAddress.equals(self);
        if (memberships.getMember(candidateAddress, candidate.getUuid()) == null) {
            throw new TopologyChangedException(String.format(
                    "Cannot initialize %s for coordinator %s: participant %s not found in local member list."
                            + " Local member list version: %s, coordinator member list version: %s",
                    jobIdAndExecutionId(jobId, executionId), coordinator, candidate, localVersion,
                    coordinatorMemberListVersion));
        }
    }
    if (localIsParticipant) {
        return;
    }
    throw new IllegalArgumentException(String.format(
            "Cannot initialize %s since member %s is not in participants: %s",
            jobIdAndExecutionId(jobId, executionId), self, participants));
}
Use of com.hazelcast.jet.core.TopologyChangedException in project hazelcast by hazelcast.
The class AbstractJetMessageTask, method getInvocationBuilder.
/**
 * Creates an invocation builder for the given operation. The target is the
 * light job coordinator when {@code getLightJobCoordinator()} returns a
 * non-null value, and the master address otherwise.
 *
 * @param operation the operation to invoke
 * @return an invocation builder for the Jet service, targeting the resolved address
 * @throws TopologyChangedException    if the light job coordinator is no
 *         longer a member of the cluster
 * @throws RetryableHazelcastException if the master address is not known yet
 */
@Override
protected InvocationBuilder getInvocationBuilder(Operation operation) {
    Address target;
    if (getLightJobCoordinator() == null) {
        // No light job coordinator: the operation goes to the master.
        target = nodeEngine.getMasterAddress();
        if (target == null) {
            throw new RetryableHazelcastException("master not yet known");
        }
    } else {
        MemberImpl coordinator = nodeEngine.getClusterService().getMember(getLightJobCoordinator());
        if (coordinator == null) {
            throw new TopologyChangedException("Light job coordinator left the cluster");
        }
        target = coordinator.getAddress();
    }
    return nodeEngine.getOperationService()
            .createInvocationBuilder(JetServiceBackend.SERVICE_NAME, operation, target);
}
Use of com.hazelcast.jet.core.TopologyChangedException in project hazelcast-jet by hazelcast.
The class MasterContext, method getInitResult.
/**
 * Derives the outcome of the init phase from the per-member responses.
 * <ul>
 * <li>If the job is cancelled, returns a {@link CancellationException}.
 * <li>If the number of successful members equals the size of the execution
 * plan map, returns {@code null} (no failure).
 * <li>If there is at least one non-topological failure, such as an
 * exception in user code, returns the first such failure, peeled via
 * {@code ExceptionUtil.peel}.
 * <li>Otherwise the failure is because a job participant has left the
 * cluster; a {@link TopologyChangedException} is returned so that the job
 * will be restarted.
 * </ul>
 *
 * @param responses init responses keyed by member; grouped into failures
 *                  and successes by {@code groupResponses}
 * @return the error describing the outcome, or {@code null} on success
 */
private Throwable getInitResult(Map<MemberInfo, Object> responses) {
    if (cancellationToken.isCompleted()) {
        logger.fine(jobIdString() + " to be cancelled after init");
        return new CancellationException();
    }

    Map<Boolean, List<Entry<MemberInfo, Object>>> grouped = groupResponses(responses);
    // Only the number of successful members matters here, so the group is
    // not copied into a separate collection.
    if (grouped.get(false).size() == executionPlanMap.size()) {
        logger.fine("Init of " + jobIdString() + " is successful.");
        return null;
    }

    List<Entry<MemberInfo, Object>> failures = grouped.get(true);
    logger.fine("Init of " + jobIdString() + " failed with: " + failures);

    // If there is at least one non-topological failure, return the first one;
    // otherwise return TopologyChangedException so that the job will be
    // restarted. orElseGet defers creating the exception (and capturing its
    // stack trace) until it is actually needed.
    return failures.stream()
            .map(e -> (Throwable) e.getValue())
            .filter(t -> !isTopologicalFailure(t))
            .findFirst()
            .map(ExceptionUtil::peel)
            .orElseGet(TopologyChangedException::new);
}
Aggregations