Use of com.hazelcast.cluster.Address in project hazelcast by hazelcast.
The class LinkedAddresses, method getResolvedAddresses:
public static LinkedAddresses getResolvedAddresses(Address primaryAddress) {
    LinkedAddresses linkedAddresses = new LinkedAddresses(primaryAddress);
    try {
        InetAddress inetAddress = primaryAddress.getInetAddress();
        // IP address for the given primary address
        String ip = inetAddress.getHostAddress();
        Address addressIp = new Address(ip, primaryAddress.getPort());
        linkedAddresses.addAddress(addressIp);
    } catch (UnknownHostException e) {
        // we have a hostname here in `address`, but we can't resolve it;
        // how on earth could we get here?
        ignore(e);
    }
    return linkedAddresses;
}
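
For context, a minimal usage sketch. The hostname and port are placeholders, and the import assumes LinkedAddresses' internal package (com.hazelcast.internal.server.tcp in recent Hazelcast versions):

import com.hazelcast.cluster.Address;
import com.hazelcast.internal.server.tcp.LinkedAddresses;

public class ResolvedAddressesExample {
    public static void main(String[] args) throws Exception {
        // "member1.example.com" and 5701 are placeholder values; the Address
        // constructor itself throws UnknownHostException for unknown hosts.
        Address primary = new Address("member1.example.com", 5701);
        LinkedAddresses linked = LinkedAddresses.getResolvedAddresses(primary);
        // `linked` now holds the hostname-based primary address plus, when
        // resolution succeeds, an equivalent IP-based Address on the same port.
        System.out.println(linked);
    }
}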
Use of com.hazelcast.cluster.Address in project hazelcast by hazelcast.
The class JobExecutionService, method checkExecutions:
/**
 * See also javadoc at {@link CheckLightJobsOperation}.
 */
private void checkExecutions() {
    try {
        long now = System.nanoTime();
        long uninitializedContextThreshold = now - UNINITIALIZED_CONTEXT_MAX_AGE_NS;
        Map<Address, List<Long>> executionsPerMember = new HashMap<>();
        for (ExecutionContext ctx : executionContexts.values()) {
            if (!ctx.isLightJob()) {
                continue;
            }
            Address coordinator = ctx.coordinator();
            if (coordinator != null) {
                // if the coordinator is known, add the execution to the list to check
                executionsPerMember
                        .computeIfAbsent(coordinator, k -> new ArrayList<>())
                        .add(ctx.executionId());
            } else {
                // if the coordinator is not known, remove the execution once it
                // has remained uninitialized for too long
                if (ctx.getCreatedOn() <= uninitializedContextThreshold) {
                    LoggingUtil.logFine(logger, "Terminating light job %s because it wasn't initialized during %d seconds",
                            idToString(ctx.executionId()), NANOSECONDS.toSeconds(UNINITIALIZED_CONTEXT_MAX_AGE_NS));
                    terminateExecution0(ctx, TerminationMode.CANCEL_FORCEFUL, new CancellationException());
                }
            }
        }
        // submit the query to each coordinator
        for (Entry<Address, List<Long>> en : executionsPerMember.entrySet()) {
            long[] executionIds = en.getValue().stream().mapToLong(Long::longValue).toArray();
            Operation op = new CheckLightJobsOperation(executionIds);
            InvocationFuture<long[]> future = nodeEngine.getOperationService()
                    .createInvocationBuilder(JetServiceBackend.SERVICE_NAME, op, en.getKey())
                    .invoke();
            future.whenComplete((r, t) -> {
                if (t instanceof TargetNotMemberException) {
                    // if the target isn't a member, then all executions are unknown
                    r = executionIds;
                } else if (t != null) {
                    logger.warning("Failed to check light job state with coordinator " + en.getKey() + ": " + t, t);
                    return;
                }
                assert r != null;
                for (long executionId : r) {
                    ExecutionContext execCtx = executionContexts.get(executionId);
                    if (execCtx != null) {
                        logger.fine("Terminating light job " + idToString(executionId)
                                + " because the coordinator doesn't know it");
                        terminateExecution0(execCtx, TerminationMode.CANCEL_FORCEFUL, new CancellationException());
                    }
                }
            });
        }
        // clean up failedJobs
        failedJobs.values().removeIf(expiryTime -> expiryTime < now);
    } catch (Throwable e) {
        logger.severe("Failed to query live light executions: " + e, e);
    }
}
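
The heart of checkExecutions() is a group-then-batch pattern: collect execution IDs per coordinator, then send one batched check per member. A minimal, dependency-free sketch of that pattern; String stands in for Address, and all IDs and endpoints are made-up values:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
    public static void main(String[] args) {
        // Group execution IDs per coordinator, as checkExecutions() does.
        Map<String, List<Long>> executionsPerMember = new HashMap<>();
        executionsPerMember.computeIfAbsent("10.0.0.1:5701", k -> new ArrayList<>()).add(1001L);
        executionsPerMember.computeIfAbsent("10.0.0.1:5701", k -> new ArrayList<>()).add(1002L);
        executionsPerMember.computeIfAbsent("10.0.0.2:5701", k -> new ArrayList<>()).add(2001L);

        // One batched check per member: convert the boxed list to long[]
        // the same way the method does before building its check operation.
        for (Map.Entry<String, List<Long>> en : executionsPerMember.entrySet()) {
            long[] executionIds = en.getValue().stream().mapToLong(Long::longValue).toArray();
            System.out.println(en.getKey() + " -> " + executionIds.length + " executions to check");
        }
    }
}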
Use of com.hazelcast.cluster.Address in project hazelcast by hazelcast.
The class JobExecutionService, method initExecution:
/**
 * Initiates the given execution if the local node accepts the coordinator
 * as its master, and has up-to-date member list information.
 * <ul><li>
 *     If the local node has a stale member list, it retries the init operation
 *     until it receives the new member list from the master.
 * </li><li>
 *     If the local node detects that the member list changed after the init
 *     operation was sent but before it executed, it sends a graceful failure
 *     so that the job init will be retried properly.
 * </li><li>
 *     If there is an already ongoing execution for the given job, the
 *     init execution is retried.
 * </li></ul>
 */
public void initExecution(long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion,
                          Set<MemberInfo> participants, ExecutionPlan plan) {
    ExecutionContext execCtx = addExecutionContext(jobId, executionId, coordinator, coordinatorMemberListVersion, participants);
    try {
        jobClassloaderService.prepareProcessorClassLoaders(jobId);
        Set<Address> addresses = participants.stream().map(MemberInfo::getAddress).collect(toSet());
        ClassLoader jobCl = jobClassloaderService.getClassLoader(jobId);
        doWithClassLoader(jobCl, () -> execCtx.initialize(coordinator, addresses, plan));
    } finally {
        jobClassloaderService.clearProcessorClassLoaders();
    }
    // initial log entry with all of jobId, jobName, executionId
    logger.info("Execution plan for jobId=" + idToString(jobId)
            + ", jobName=" + (execCtx.jobName() != null ? '\'' + execCtx.jobName() + '\'' : "null")
            + ", executionId=" + idToString(executionId) + " initialized");
}
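
initExecution runs the plan initialization with the job classloader installed via doWithClassLoader. As a sketch of what such a helper typically does (an illustrative reimplementation, not Hazelcast's actual com.hazelcast.jet.impl.util.Util code), the pattern is: swap the thread context classloader, run the action, restore:

public final class ClassLoaderUtil {
    // Illustrative doWithClassLoader-style helper: install the given classloader
    // as the thread context classloader for the duration of the action.
    public static void doWithClassLoader(ClassLoader cl, Runnable action) {
        Thread thread = Thread.currentThread();
        ClassLoader previous = thread.getContextClassLoader();
        thread.setContextClassLoader(cl);
        try {
            action.run();
        } finally {
            // always restore the previous classloader, even if the action throws
            thread.setContextClassLoader(previous);
        }
    }

    private ClassLoaderUtil() {
    }
}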
Use of com.hazelcast.cluster.Address in project hazelcast by hazelcast.
The class MasterJobContext, method getErrorFromResponses:
/**
 * <ul>
 *     <li>Returns {@code null} if there is no failure
 *     <li>Returns a {@link CancellationException} if the job is cancelled
 *         forcefully.
 *     <li>Returns a {@link JobTerminateRequestedException} if the current
 *         execution is stopped due to a requested termination, except for
 *         CANCEL_GRACEFUL, in which case a CancellationException is returned.
 *     <li>If there is at least one user failure, such as an exception in user
 *         code (restartable or not), then returns that failure.
 *     <li>Otherwise, the failure is because a job participant has left the
 *         cluster. In that case, it returns a {@code TopologyChangedException} so
 *         that the job will be restarted.
 * </ul>
 */
private Throwable getErrorFromResponses(String opName, Collection<Map.Entry<MemberInfo, Object>> responses) {
    if (isCancelled()) {
        logger.fine(mc.jobIdString() + " to be cancelled after " + opName);
        return new CancellationException();
    }
    Map<Boolean, List<Entry<Address, Object>>> grouped = responses.stream()
            .map(en -> entry(en.getKey().getAddress(), en.getValue()))
            .collect(partitioningBy(e1 -> e1.getValue() instanceof Throwable));
    int successfulMembersCount = grouped.getOrDefault(false, emptyList()).size();
    if (successfulMembersCount == mc.executionPlanMap().size()) {
        logger.fine(opName + " of " + mc.jobIdString() + " was successful");
        return null;
    }
    List<Entry<Address, Object>> failures = grouped.getOrDefault(true, emptyList());
    if (!failures.isEmpty()) {
        logger.fine(opName + " of " + mc.jobIdString() + " has failures: " + failures);
    }
    // If all failures are TerminatedWithSnapshotException, the execution terminated
    // after a terminal snapshot. If there are other exceptions, ignore this and
    // handle the other exception.
    if (failures.stream().allMatch(entry -> entry.getValue() instanceof TerminatedWithSnapshotException)) {
        assert opName.equals("Execution") : "opName is '" + opName + "', expected 'Execution'";
        logger.fine(opName + " of " + mc.jobIdString() + " terminated after a terminal snapshot");
        TerminationMode mode = requestedTerminationMode;
        assert mode != null && mode.isWithTerminalSnapshot() : "mode=" + mode;
        return mode == CANCEL_GRACEFUL ? new CancellationException() : new JobTerminateRequestedException(mode);
    }
    // If all exceptions are of a certain type, treat it as TopologyChangedException
    Map<Boolean, List<Entry<Address, Object>>> splitFailures = failures.stream()
            .collect(Collectors.partitioningBy(e -> e.getValue() instanceof CancellationException
                    || e.getValue() instanceof TerminatedWithSnapshotException
                    || isTopologyException((Throwable) e.getValue())));
    List<Entry<Address, Object>> topologyFailures = splitFailures.getOrDefault(true, emptyList());
    List<Entry<Address, Object>> otherFailures = splitFailures.getOrDefault(false, emptyList());
    if (!otherFailures.isEmpty()) {
        return (Throwable) otherFailures.get(0).getValue();
    } else {
        return new TopologyChangedException("Causes from members: " + topologyFailures);
    }
}
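
The method leans on Collectors.partitioningBy twice: once to split responses into successes and failures, and again to split the failures into topology-related and other ones. A dependency-free sketch of that double partitioning; the values and exception types are placeholders:

import java.util.List;
import java.util.Map;
import java.util.concurrent.CancellationException;
import java.util.stream.Collectors;

public class PartitioningSketch {
    public static void main(String[] args) {
        // Placeholder responses: each value is either a result or a Throwable,
        // mirroring the Object values in getErrorFromResponses.
        List<Object> responses = List.of("ok", new CancellationException(), new RuntimeException("user code failed"));

        // First split: successes vs failures. partitioningBy always populates
        // both keys, so get(true)/get(false) are never null.
        Map<Boolean, List<Object>> grouped = responses.stream()
                .collect(Collectors.partitioningBy(v -> v instanceof Throwable));

        // Second split over the failures: topology-related vs everything else.
        // CancellationException stands in for the classes checked in the method.
        Map<Boolean, List<Object>> splitFailures = grouped.get(true).stream()
                .collect(Collectors.partitioningBy(v -> v instanceof CancellationException));

        // As in the method: any non-topology failure wins; otherwise report a topology change.
        List<Object> otherFailures = splitFailures.get(false);
        System.out.println(otherFailures.isEmpty() ? "topology changed" : "real failure: " + otherFailures.get(0));
    }
}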
Use of com.hazelcast.cluster.Address in project hazelcast by hazelcast.
The class Networking, method createFlowControlPacket:
private Map<Address, byte[]> createFlowControlPacket() throws IOException {
    class MemberData {
        final BufferObjectDataOutput output = createObjectDataOutput(nodeEngine, lastFlowPacketSize);
        final Connection memberConnection;
        Long startedExecutionId;

        MemberData(Address address) {
            memberConnection = getMemberConnection(nodeEngine, address);
        }
    }
    Map<Address, MemberData> res = new HashMap<>();
    for (ExecutionContext execCtx : jobExecutionService.getExecutionContexts()) {
        Map<SenderReceiverKey, ReceiverTasklet> receiverMap = execCtx.receiverMap();
        if (receiverMap == null) {
            continue;
        }
        for (Entry<SenderReceiverKey, ReceiverTasklet> en : receiverMap.entrySet()) {
            assert !en.getKey().address.equals(nodeEngine.getThisAddress());
            MemberData md = res.computeIfAbsent(en.getKey().address, address -> new MemberData(address));
            if (md.startedExecutionId == null) {
                md.startedExecutionId = execCtx.executionId();
                md.output.writeLong(md.startedExecutionId);
            }
            assert en.getKey().vertexId != TERMINAL_VERTEX_ID;
            md.output.writeInt(en.getKey().vertexId);
            md.output.writeInt(en.getKey().ordinal);
            md.output.writeInt(en.getValue().updateAndGetSendSeqLimitCompressed(md.memberConnection));
        }
        for (MemberData md : res.values()) {
            if (md.startedExecutionId != null) {
                // write a marker to terminate the values for one execution
                md.output.writeInt(TERMINAL_VERTEX_ID);
                md.startedExecutionId = null;
            }
        }
    }
    for (MemberData md : res.values()) {
        assert md.output.position() > 0;
        // write a marker to terminate all executions. Execution IDs are generated
        // by a Flake ID generator and are normally > 0; we use MIN_VALUE as the terminator.
        md.output.writeLong(TERMINAL_EXECUTION_ID);
    }
    // finalize the packets
    int maxSize = 0;
    for (Entry<Address, MemberData> entry : res.entrySet()) {
        byte[] data = entry.getValue().output.toByteArray();
        // we break type safety to avoid creating a new map: we replace the values
        // with a different type in place
        @SuppressWarnings({"unchecked", "rawtypes"})
        Entry<Address, byte[]> entry1 = (Entry) entry;
        entry1.setValue(data);
        if (data.length > maxSize) {
            maxSize = data.length;
        }
    }
    lastFlowPacketSize = maxSize;
    return (Map) res;
}
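
The packet layout written above is a nested, sentinel-terminated stream: per execution, a long executionId followed by (vertexId, ordinal, compressed seq limit) int triples terminated by TERMINAL_VERTEX_ID, with the whole execution list terminated by TERMINAL_EXECUTION_ID. A sketch of a matching read loop using plain java.io; the sentinel constants here are local stand-ins (the writer's comment indicates MIN_VALUE for the execution terminator, while the vertex sentinel value is assumed):

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

public class FlowControlPacketReader {
    // Local stand-ins for the sentinels used by the writer; the real values
    // live in Hazelcast's internals.
    static final long TERMINAL_EXECUTION_ID = Long.MIN_VALUE;
    static final int TERMINAL_VERTEX_ID = -1; // assumed placeholder value

    static void readPacket(byte[] packet) throws IOException {
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(packet));
        long executionId;
        // outer loop: one block per execution, ended by the long sentinel
        while ((executionId = in.readLong()) != TERMINAL_EXECUTION_ID) {
            int vertexId;
            // inner loop: (vertexId, ordinal, seqLimit) triples, ended by the int sentinel
            while ((vertexId = in.readInt()) != TERMINAL_VERTEX_ID) {
                int ordinal = in.readInt();
                int seqLimit = in.readInt();
                System.out.printf("execution %d: vertex %d ordinal %d limit %d%n",
                        executionId, vertexId, ordinal, seqLimit);
            }
        }
    }
}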