use of com.hazelcast.jet.impl.metrics.RawJobMetrics in project hazelcast by hazelcast.
the class JobRepository method completeJob.
/**
 * Records the outcome of a finished job: optionally stores its terminal
 * metrics, stores a {@code JobResult} built from the master context, and
 * finally calls {@code deleteJob} for the job id.
 * <p>
 * Storing the metrics is best-effort: any exception is logged and ignored.
 * Storing the {@code JobResult} is retried indefinitely, pausing one second
 * between attempts, unless the failure is a
 * {@code HazelcastInstanceNotActiveException} while the local instance is
 * no longer running, in which case the exception is rethrown.
 *
 * @param masterContext   source of the job id, config and creation time
 * @param terminalMetrics metrics to store under the job id; skipped when {@code null}
 * @param error           failure of the job, converted with {@code toErrorMsg}; may be {@code null}
 * @param completionTime  completion timestamp stored in the result
 * @throws JobNotFoundException  if the JobRecord is not found
 *                               (NOTE(review): not visible in this method, presumably raised by deleteJob - confirm)
 * @throws IllegalStateException if the JobResult is already present
 *                               (NOTE(review): the result is written with set(...), confirm this still applies)
 */
void completeJob(@Nonnull MasterContext masterContext, @Nullable List<RawJobMetrics> terminalMetrics, @Nullable Throwable error, long completionTime) {
    final long id = masterContext.jobId();
    final JobResult result = new JobResult(
            id,
            masterContext.jobRecord().getConfig(),
            masterContext.jobRecord().getCreationTime(),
            completionTime,
            toErrorMsg(error));

    // Best effort: a failure to store the metrics must not prevent storing the result.
    if (terminalMetrics != null) {
        try {
            if (jobMetrics.get().put(id, terminalMetrics) != null) {
                logger.warning("Overwriting job metrics for job " + result);
            }
        } catch (Exception e) {
            logger.warning("Storing the job metrics failed, ignoring: " + e, e);
        }
    }

    // Keep trying to store the JobResult until it succeeds.
    boolean stored = false;
    while (!stored) {
        try {
            jobResults.get().set(id, result);
            stored = true;
        } catch (Exception e) {
            // If the local instance was shut down there is no point in retrying: re-throw.
            LifecycleService lifecycle = instance.getLifecycleService();
            boolean instanceInactive = e instanceof HazelcastInstanceNotActiveException;
            if (instanceInactive && !lifecycle.isRunning()) {
                throw e;
            }
            // Otherwise retry after a delay.
            long retryTimeoutSeconds = 1;
            logger.warning("Failed to store JobResult, will retry in " + retryTimeoutSeconds + " seconds: " + e, e);
            LockSupport.parkNanos(SECONDS.toNanos(retryTimeoutSeconds));
        }
    }

    deleteJob(id);
}
use of com.hazelcast.jet.impl.metrics.RawJobMetrics in project hazelcast by hazelcast.
the class JobMetricsUtil method toJobMetrics.
/**
 * Builds a {@code JobMetrics} from the given raw metrics.
 * <p>
 * Entries whose blob is {@code null} are skipped. For every other entry the
 * consumer's timestamp is set to the entry's timestamp before its blob is
 * extracted into the consumer.
 *
 * @param rawJobMetrics raw metrics to decode
 * @return the metrics collected by the consumer, or {@code JobMetrics.empty()}
 *         when no entry had a blob
 */
static JobMetrics toJobMetrics(List<RawJobMetrics> rawJobMetrics) {
    // Created lazily, so it stays null when there is nothing to decode.
    JobMetricsConsumer collector = null;
    for (RawJobMetrics raw : rawJobMetrics) {
        if (raw.getBlob() != null) {
            if (collector == null) {
                collector = new JobMetricsConsumer();
            }
            collector.timestamp = raw.getTimestamp();
            MetricsCompressor.extractMetrics(raw.getBlob(), collector);
        }
    }
    if (collector == null) {
        return JobMetrics.empty();
    }
    return JobMetrics.of(collector.metrics);
}
use of com.hazelcast.jet.impl.metrics.RawJobMetrics in project hazelcast by hazelcast.
the class JobExecutionService method runLightJob.
/**
 * Registers, initializes and begins the execution of a light job on this member.
 * <p>
 * The cluster information is verified and the service is checked to be
 * running first. The execution context is then registered (or looked up)
 * under {@code mutex} and initialized with the participants' addresses and
 * the plan. If initialization throws, the execution is completed with a
 * {@code CancellationException} and the original throwable is rethrown.
 *
 * @return the future returned by {@code beginExecution0}
 */
public CompletableFuture<RawJobMetrics> runLightJob(long jobId, long executionId, Address coordinator, int coordinatorMemberListVersion, Set<MemberInfo> participants, ExecutionPlan plan) {
    assert executionId == jobId : "executionId(" + idToString(executionId) + ") != jobId(" + idToString(jobId) + ")";
    verifyClusterInformation(jobId, executionId, coordinator, coordinatorMemberListVersion, participants);
    failIfNotRunning();

    final ExecutionContext lightCtx;
    synchronized (mutex) {
        addExecutionContextJobId(jobId, executionId, coordinator);
        lightCtx = executionContexts.computeIfAbsent(
                executionId,
                key -> new ExecutionContext(nodeEngine, jobId, executionId, true));
    }

    try {
        Set<Address> memberAddresses = participants.stream()
                .map(MemberInfo::getAddress)
                .collect(toSet());
        ClassLoader classLoader = jobClassloaderService.getClassLoader(jobId);
        // No class loader is created for light jobs, so the lookup is expected to yield null.
        assert jobClassloaderService.getClassLoader(jobId) == null;
        doWithClassLoader(classLoader, () -> lightCtx.initialize(coordinator, memberAddresses, plan));
    } catch (Throwable t) {
        // Initialization failed: complete the execution before propagating the failure.
        completeExecution(lightCtx, new CancellationException());
        throw t;
    }

    // Initial log entry carrying all of jobId, jobName and executionId.
    if (logger.isFineEnabled()) {
        logger.fine("Execution plan for light job ID=" + idToString(jobId)
                + ", jobName=" + (lightCtx.jobName() == null ? "null" : '\'' + lightCtx.jobName() + '\'')
                + ", executionId=" + idToString(executionId)
                + " initialized, will start the execution");
    }
    return beginExecution0(lightCtx, false);
}
use of com.hazelcast.jet.impl.metrics.RawJobMetrics in project hazelcast by hazelcast.
the class MasterJobContext method onStartExecutionComplete.
/**
 * Handles the completion of the start-execution step.
 * <p>
 * If the job is neither STARTING nor RUNNING, the given error is logged via
 * {@code logCannotComplete} and replaced with an {@code IllegalStateException}.
 * All response values that are {@code RawJobMetrics} are passed to
 * {@code setJobMetrics}. The job is then finalized with the resulting error,
 * either immediately or, for a termination with a terminal snapshot, once
 * the terminal snapshot future completes.
 *
 * @param error     the failure of the execution, if any
 * @param responses per-member responses
 */
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus currentStatus = mc.jobStatus();
    boolean startingOrRunning = currentStatus == STARTING || currentStatus == RUNNING;
    if (!startingOrRunning) {
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }

    List<RawJobMetrics> memberMetrics = responses.stream()
            .map(Entry::getValue)
            .filter(RawJobMetrics.class::isInstance)
            .map(RawJobMetrics.class::cast)
            .collect(Collectors.toList());
    setJobMetrics(memberMetrics);

    boolean terminatedWithSnapshot = error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot();
    if (terminatedWithSnapshot) {
        Throwable finalError = error;
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        mc.snapshotContext().terminalSnapshotFuture()
          .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(finalError)));
        return;
    }

    // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
    // This cancellation can be because the master cancelled it. If that's the case, convert the
    // exception to JobTerminateRequestedException.
    // The cancellation can also happen if some participant left and the target cancelled the
    // execution locally in JobExecutionService.onMemberRemoved(). We keep this (and possibly
    // other) exceptions as they are and let the execution complete with failure.
    if (error instanceof ExecutionNotFoundException && requestedTerminationMode != null) {
        error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
    }
    finalizeJob(error);
}
Aggregations