Example usage of io.mantisrx.runtime.lifecycle.Lifecycle in the Netflix Mantis project:
the executeStage method of the WorkerExecutionOperationsNetworkStage class.
/**
 * Executes the stage assigned to this worker.
 *
 * <p>Builds a {@code RunningWorker} from the {@code ExecuteStageRequest}, starts the job
 * {@code Lifecycle}, sets up heartbeats, then dispatches on the stage number: stage 0 runs
 * the Job Master, a single-stage job runs source+sink on this machine, stage 1 of a
 * multi-stage job serves the source as a remote observable, and any other stage runs via
 * {@code executeNonSourceStage}. Blocks until the worker terminates, then calls
 * {@code lifecycle.shutdown()}. Any failure signals the worker as failed and shuts the
 * stage down.
 *
 * @param setup execution details carrying the stage request, job, parameters and status observer
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void executeStage(final ExecutionDetails setup) {
    ExecuteStageRequest executionRequest = setup.getExecuteStageRequest().getRequest();
    // Initialize the schedulingInfo observable for current job and mark it shareable to be reused by anyone interested in this data.
    // Observable<JobSchedulingInfo> selfSchedulingInfo = mantisMasterApi.schedulingChanges(executionRequest.getJobId()).switchMap((e) -> Observable.just(e).repeatWhen(x -> x.delay(5 , TimeUnit.SECONDS))).subscribeOn(Schedulers.io()).share();
    Observable<JobSchedulingInfo> selfSchedulingInfo =
            mantisMasterApi.schedulingChanges(executionRequest.getJobId()).subscribeOn(Schedulers.io()).share();
    WorkerInfo workerInfo = generateWorkerInfo(executionRequest.getJobName(), executionRequest.getJobId(),
            executionRequest.getStage(), executionRequest.getWorkerIndex(), executionRequest.getWorkerNumber(),
            executionRequest.getDurationType(), "host", executionRequest.getWorkerPorts());
    final Observable<Integer> sourceStageTotalWorkersObs = createSourceStageTotalWorkersObservable(selfSchedulingInfo);
    RunningWorker.Builder rwBuilder = new RunningWorker.Builder()
            .job(setup.getMantisJob())
            .schedulingInfo(executionRequest.getSchedulingInfo())
            .stageTotalWorkersObservable(sourceStageTotalWorkersObs)
            .jobName(executionRequest.getJobName())
            .stageNum(executionRequest.getStage())
            .workerIndex(executionRequest.getWorkerIndex())
            .workerNum(executionRequest.getWorkerNumber())
            .totalStages(executionRequest.getTotalNumStages())
            .metricsPort(executionRequest.getMetricsPort())
            .ports(executionRequest.getPorts().iterator())
            .jobStatusObserver(setup.getStatus())
            .requestSubject(setup.getExecuteStageRequest().getRequestSubject())
            .workerInfo(workerInfo)
            .vmTaskStatusObservable(vmTaskStatusObserver)
            .hasJobMaster(executionRequest.getHasJobMaster())
            .jobId(executionRequest.getJobId());
    if (executionRequest.getStage() == 0) {
        // stage 0 is the synthetic Job Master stage; it has no user-defined StageConfig
        rwBuilder = rwBuilder.stage(new JobMasterStageConfig("jobmasterconfig"));
    } else {
        // stages are indexed from 1 externally but stored 0-based in the job definition
        rwBuilder = rwBuilder.stage((StageConfig) setup.getMantisJob().getStages().get(executionRequest.getStage() - 1));
    }
    final RunningWorker rw = rwBuilder.build();
    AtomicReference<SubscriptionStateHandler> subscriptionStateHandlerRef = new AtomicReference<>();
    if (rw.getStageNum() == rw.getTotalStagesNet()) {
        // set up subscription state handler only for sink (last) stage
        subscriptionStateHandlerRef.set(setupSubscriptionStateHandler(
                setup.getExecuteStageRequest().getRequest().getJobId(), mantisMasterApi,
                setup.getExecuteStageRequest().getRequest().getSubscriptionTimeoutSecs(),
                setup.getExecuteStageRequest().getRequest().getMinRuntimeSecs()));
    }
    logger.info("Running worker info: " + rw);
    rw.signalStartedInitiated();
    try {
        logger.info(">>>>>>>>>>>>>>>>Calling lifecycle.startup()");
        Lifecycle lifecycle = rw.getJob().getLifecycle();
        lifecycle.startup();
        ServiceLocator serviceLocator = lifecycle.getServiceLocator();
        if (lookupSpectatorRegistry) {
            try {
                final Registry spectatorRegistry = serviceLocator.service(Registry.class);
                SpectatorRegistryFactory.setRegistry(spectatorRegistry);
            } catch (Throwable t) {
                // BUGFIX: pass the throwable to the logger so the failure cause is not silently dropped
                logger.error("failed to init spectator registry using service locator, falling back to {}",
                        SpectatorRegistryFactory.getRegistry().getClass().getCanonicalName(), t);
            }
        }
        // create job context
        Parameters parameters = ParameterUtils.createContextParameters(rw.getJob().getParameterDefinitions(), setup.getParameters());
        final Context context = generateContext(parameters, serviceLocator, workerInfo, MetricsRegistry.getInstance(), () -> {
            rw.signalCompleted();
            // wait for completion signal to go to the master and us getting killed. Upon timeout, exit.
            try {
                Thread.sleep(60000);
            } catch (InterruptedException ie) {
                logger.warn("Unexpected exception sleeping: " + ie.getMessage());
                // restore interrupt status before falling through to exit
                Thread.currentThread().interrupt();
            }
            System.exit(0);
        }, createWorkerMapObservable(selfSchedulingInfo, executionRequest.getJobName(), executionRequest.getJobId(), executionRequest.getDurationType()));
        // context.setPrevStageCompletedObservable(createPrevStageCompletedObservable(selfSchedulingInfo, rw.getJobId(), rw.getStageNum()));
        rw.setContext(context);
        // setup heartbeats
        heartbeatRef.set(new Heartbeat(rw.getJobId(), rw.getStageNum(), rw.getWorkerIndex(), rw.getWorkerNum()));
        final double networkMbps = executionRequest.getSchedulingInfo().forStage(rw.getStageNum()).getMachineDefinition().getNetworkMbps();
        startSendingHeartbeats(rw.getJobStatus(),
                new WorkerId(executionRequest.getJobId(), executionRequest.getWorkerIndex(), executionRequest.getWorkerNumber()).getId(),
                networkMbps);
        // execute stage
        if (rw.getStageNum() == 0) {
            logger.info("JobId: " + rw.getJobId() + ", executing Job Master");
            final AutoScaleMetricsConfig autoScaleMetricsConfig = new AutoScaleMetricsConfig();
            // Temporary workaround to enable auto-scaling by custom metric in Job Master. This will be revisited to get the entire autoscaling config
            // for a job as a System parameter in the JobMaster
            final String autoScaleMetricString = (String) parameters.get(JOB_MASTER_AUTOSCALE_METRIC_SYSTEM_PARAM, "");
            if (!Strings.isNullOrEmpty(autoScaleMetricString)) {
                // expected format: metricGroup::metricName::aggregationAlgo
                final List<String> tokens = Splitter.on("::").omitEmptyStrings().trimResults().splitToList(autoScaleMetricString);
                if (tokens.size() == 3) {
                    final String metricGroup = tokens.get(0);
                    final String metricName = tokens.get(1);
                    final String algo = tokens.get(2);
                    try {
                        final AutoScaleMetricsConfig.AggregationAlgo aggregationAlgo = AutoScaleMetricsConfig.AggregationAlgo.valueOf(algo);
                        logger.info("registered UserDefined auto scale metric {}:{} algo {}", metricGroup, metricName, aggregationAlgo);
                        autoScaleMetricsConfig.addUserDefinedMetric(metricGroup, metricName, aggregationAlgo);
                    } catch (IllegalArgumentException e) {
                        // BUGFIX: report the offending algorithm token, not the config object's toString()
                        final String errorMsg = String.format("ERROR: Invalid algorithm value %s for param %s (algo should be one of %s)",
                                algo, JOB_MASTER_AUTOSCALE_METRIC_SYSTEM_PARAM,
                                Arrays.stream(AutoScaleMetricsConfig.AggregationAlgo.values()).map(a -> a.name()).collect(Collectors.toList()));
                        logger.error(errorMsg);
                        throw new RuntimeException(errorMsg);
                    }
                } else {
                    final String errorMsg = String.format("ERROR: Invalid value %s for param %s", autoScaleMetricString, JOB_MASTER_AUTOSCALE_METRIC_SYSTEM_PARAM);
                    logger.error(errorMsg);
                    throw new RuntimeException(errorMsg);
                }
            } else {
                logger.info("param {} is null or empty", JOB_MASTER_AUTOSCALE_METRIC_SYSTEM_PARAM);
            }
            final JobMasterService jobMasterService = new JobMasterService(rw.getJobId(), rw.getSchedulingInfo(),
                    workerMetricsClient, autoScaleMetricsConfig, mantisMasterApi, rw.getContext(),
                    rw.getOnCompleteCallback(), rw.getOnErrorCallback(), rw.getOnTerminateCallback());
            jobMasterService.start();
            signalStarted(rw, subscriptionStateHandlerRef);
            // block until worker terminates
            rw.waitUntilTerminate();
        } else if (rw.getStageNum() == 1 && rw.getTotalStagesNet() == 1) {
            logger.info("JobId: " + rw.getJobId() + ", single stage job, executing entire job");
            // single stage, execute entire job on this machine
            PortSelector portSelector = new PortSelector() {

                @Override
                public int acquirePort() {
                    return rw.getPorts().next();
                }
            };
            RxMetrics rxMetrics = new RxMetrics();
            StageExecutors.executeSingleStageJob(rw.getJob().getSource(), rw.getStage(), rw.getJob().getSink(),
                    portSelector, rxMetrics, rw.getContext(), rw.getOnTerminateCallback(), rw.getWorkerIndex(),
                    rw.getSourceStageTotalWorkersObservable(), onSinkSubscribe, onSinkUnsubscribe,
                    rw.getOnCompleteCallback(), rw.getOnErrorCallback());
            signalStarted(rw, subscriptionStateHandlerRef);
            // block until worker terminates
            rw.waitUntilTerminate();
        } else {
            logger.info("JobId: " + rw.getJobId() + ", executing a multi-stage job, stage: " + rw.getStageNum());
            if (rw.getStageNum() == 1) {
                // execute source stage, publishing its output as a remote observable for stage 2 workers
                String remoteObservableName = rw.getJobId() + "_" + rw.getStageNum();
                StageSchedulingInfo currentStageSchedulingInfo = rw.getSchedulingInfo().forStage(1);
                WorkerPublisherRemoteObservable publisher = new WorkerPublisherRemoteObservable<>(rw.getPorts().next(),
                        remoteObservableName, numWorkersAtStage(selfSchedulingInfo, rw.getJobId(), rw.getStageNum() + 1),
                        rw.getJobName());
                StageExecutors.executeSource(rw.getWorkerIndex(), rw.getJob().getSource(), rw.getStage(), publisher,
                        rw.getContext(), rw.getSourceStageTotalWorkersObservable());
                logger.info("JobId: " + rw.getJobId() + " stage: " + rw.getStageNum() + ", serving remote observable for source with name: " + remoteObservableName);
                RemoteRxServer server = publisher.getServer();
                RxMetrics rxMetrics = server.getMetrics();
                MetricsRegistry.getInstance().registerAndGet(rxMetrics.getCountersAndGauges());
                signalStarted(rw, subscriptionStateHandlerRef);
                logger.info("JobId: " + rw.getJobId() + " stage: " + rw.getStageNum() + ", blocking until source observable completes");
                server.blockUntilServerShutdown();
            } else {
                // execute intermediate stage or last stage plus sink
                executeNonSourceStage(selfSchedulingInfo, rw, subscriptionStateHandlerRef);
            }
        }
        logger.info("Calling lifecycle.shutdown()");
        lifecycle.shutdown();
    } catch (Throwable t) {
        rw.signalFailed(t);
        shutdownStage();
    }
}
Example usage of io.mantisrx.runtime.lifecycle.Lifecycle in the Netflix Mantis project:
the execute method of the LocalJobExecutorNetworked class.
/**
 * Runs an entire Mantis job locally on this machine for development/testing.
 *
 * <p>Validates the job, starts a local metrics server and the job {@code Lifecycle},
 * then wires all stages together over local ports: a single-stage job runs source+sink
 * per instance; a multi-stage job starts source workers, then each intermediate stage,
 * then the sink stage, passing each stage's sink ports to the next stage. Blocks until
 * all sink-stage instances complete, then shuts down the lifecycle and metrics server.
 *
 * @param job            the Mantis job to execute
 * @param schedulingInfo per-stage instance counts and machine definitions (stages indexed from 1)
 * @param parameters     job parameter overrides
 * @throws IllegalMantisJobException if job validation fails
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void execute(Job job, SchedulingInfo schedulingInfo, Parameter... parameters) throws IllegalMantisJobException {
    // validate job
    try {
        new ValidateJob(job).execute();
    } catch (CommandException e) {
        throw new IllegalMantisJobException(e);
    }
    // execute job
    List<StageConfig> stages = job.getStages();
    final SourceHolder source = job.getSource();
    final SinkHolder sink = job.getSink();
    final PortSelector portSelector = new PortSelectorInRange(8000, 9000);
    // register netty metrics
    RxNetty.useMetricListenersFactory(new MantisNettyEventsListenerFactory());
    // start our metrics server (emptyMap() instead of raw EMPTY_MAP avoids an unchecked warning)
    MetricsServer metricsServer = new MetricsServer(portSelector.acquirePort(), 1, Collections.emptyMap());
    metricsServer.start();
    Lifecycle lifecycle = job.getLifecycle();
    lifecycle.startup();
    // create job context
    Map parameterDefinitions = job.getParameterDefinitions();
    final String user = Optional.ofNullable(System.getenv("USER")).orElse("userUnknown");
    String jobId = String.format("localJob-%s-%d", user, (int) (Math.random() * 10000));
    logger.info("jobID {}", jobId);
    final ServiceLocator serviceLocator = lifecycle.getServiceLocator();
    int numInstances = schedulingInfo.forStage(1).getNumberOfInstances();
    BehaviorSubject<Integer> workersInStageOneObservable = BehaviorSubject.create(numInstances);
    BehaviorSubject<WorkerMap> workerMapObservable = BehaviorSubject.create();
    if (stages.size() == 1) {
        // single stage job
        final StageConfig stage = stages.get(0);
        // use latch to wait for all instances to complete
        final CountDownLatch waitUntilAllCompleted = new CountDownLatch(numInstances);
        Action0 countDownLatchOnComplete = new Action0() {

            @Override
            public void call() {
                waitUntilAllCompleted.countDown();
            }
        };
        Action0 nullOnCompleted = new Action0() {

            @Override
            public void call() {
            }
        };
        Action1<Throwable> nullOnError = new Action1<Throwable>() {

            @Override
            public void call(Throwable t) {
            }
        };
        Map<Integer, List<WorkerInfo>> workerInfoMap = new HashMap<>();
        List<WorkerInfo> workerInfoList = new ArrayList<>();
        // run for num of instances
        for (int i = 0; i < numInstances; i++) {
            WorkerPorts workerPorts = new WorkerPorts(portSelector.acquirePort(), portSelector.acquirePort(),
                    portSelector.acquirePort(), portSelector.acquirePort(), portSelector.acquirePort());
            WorkerInfo workerInfo = new WorkerInfo(jobId, jobId, 1, i, i + 1, MantisJobDurationType.Perpetual, "localhost", workerPorts);
            workerInfoList.add(workerInfo);
            Context context = new Context(ParameterUtils.createContextParameters(parameterDefinitions, parameters),
                    lifecycle.getServiceLocator(),
                    // new WorkerInfo(jobId, jobId, 1, i, i, MantisJobDurationType.Perpetual, "localhost", new ArrayList<>(),-1,-1),
                    workerInfo, MetricsRegistry.getInstance(), () -> {
                        System.exit(0);
                    }, workerMapObservable);
            // workers for stage 1
            // NOTE(review): unlike the multi-stage branch, the worker map is re-published on each
            // iteration here (the list grows as workers start) — presumably intentional; verify.
            workerInfoMap.put(1, workerInfoList);
            workerMapObservable.onNext(new WorkerMap(workerInfoMap));
            StageExecutors.executeSingleStageJob(source, stage, sink,
                    () -> workerInfo.getWorkerPorts().getSinkPort(), new RxMetrics(), context,
                    countDownLatchOnComplete, i, workersInStageOneObservable, null, null,
                    nullOnCompleted, nullOnError);
        }
        // wait for all instances to complete
        try {
            waitUntilAllCompleted.await();
        } catch (InterruptedException e) {
            // restore interrupt status before rethrowing as unchecked
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    } else {
        // multi-stage job
        int workerNumber = 0;
        // start source stages
        StageConfig currentStage = stages.get(0);
        StageConfig previousStage = null;
        StageSchedulingInfo currentStageScalingInfo = schedulingInfo.forStage(1);
        StageSchedulingInfo nextStageScalingInfo = schedulingInfo.forStage(2);
        // sink ports of the current stage, consumed by the next stage's workers
        int[] previousPorts = new int[currentStageScalingInfo.getNumberOfInstances()];
        Map<Integer, List<WorkerInfo>> workerInfoMap = new HashMap<>();
        List<WorkerInfo> workerInfoList = new ArrayList<>();
        for (int i = 0; i < currentStageScalingInfo.getNumberOfInstances(); i++) {
            WorkerPorts workerPorts = new WorkerPorts(portSelector.acquirePort(), portSelector.acquirePort(),
                    portSelector.acquirePort(), portSelector.acquirePort(), portSelector.acquirePort());
            WorkerInfo workerInfo = new WorkerInfo(jobId, jobId, 1, i, i + 1, MantisJobDurationType.Perpetual, "localhost", workerPorts);
            workerInfoList.add(workerInfo);
            // int sourcePort = portSelector.acquirePort();
            int sourcePort = workerInfo.getWorkerPorts().getSinkPort();
            previousPorts[i] = sourcePort;
            Context context = new Context(ParameterUtils.createContextParameters(parameterDefinitions, parameters),
                    serviceLocator, workerInfo, MetricsRegistry.getInstance(), nullAction, workerMapObservable);
            startSource(i, sourcePort, nextStageScalingInfo.getNumberOfInstances(), job.getSource(), currentStage,
                    context, workersInStageOneObservable);
        }
        // workers for stage 1
        workerInfoMap.put(1, workerInfoList);
        workerMapObservable.onNext(new WorkerMap(workerInfoMap));
        // start intermediate stages, all but last stage
        for (int i = 1; i < stages.size() - 1; i++) {
            previousStage = currentStage;
            StageSchedulingInfo previousStageScalingInfo = schedulingInfo.forStage(i);
            // stages indexed starting at 1
            currentStageScalingInfo = schedulingInfo.forStage(i + 1);
            currentStage = stages.get(i);
            // stages indexed starting at 1
            nextStageScalingInfo = schedulingInfo.forStage(i + 2);
            int[] currentPorts = new int[currentStageScalingInfo.getNumberOfInstances()];
            workerInfoList = new ArrayList<>();
            for (int j = 0; j < currentStageScalingInfo.getNumberOfInstances(); j++) {
                WorkerPorts workerPorts = new WorkerPorts(portSelector.acquirePort(), portSelector.acquirePort(),
                        portSelector.acquirePort(), portSelector.acquirePort(), portSelector.acquirePort());
                WorkerInfo workerInfo = new WorkerInfo(jobId, jobId, i + 1, j, workerNumber++, MantisJobDurationType.Perpetual, "localhost", workerPorts);
                workerInfoList.add(workerInfo);
                // int port = portSelector.acquirePort();
                int port = workerInfo.getWorkerPorts().getSinkPort();
                currentPorts[j] = port;
                Context context = new Context(ParameterUtils.createContextParameters(parameterDefinitions, parameters),
                        serviceLocator, workerInfo, MetricsRegistry.getInstance(), nullAction, workerMapObservable);
                startIntermediate(previousPorts, port, currentStage, context, j,
                        nextStageScalingInfo.getNumberOfInstances(), i, previousStageScalingInfo.getNumberOfInstances());
            }
            // workers for current stage
            workerInfoMap.put(i + 1, workerInfoList);
            workerMapObservable.onNext(new WorkerMap(workerInfoMap));
            previousPorts = currentPorts;
        }
        // start sink stage
        StageSchedulingInfo previousStageScalingInfo = schedulingInfo.forStage(stages.size() - 1);
        previousStage = stages.get(stages.size() - 2);
        currentStage = stages.get(stages.size() - 1);
        currentStageScalingInfo = schedulingInfo.forStage(stages.size());
        numInstances = currentStageScalingInfo.getNumberOfInstances();
        // use latch to wait for all instances to complete
        final CountDownLatch waitUntilAllCompleted = new CountDownLatch(numInstances);
        Action0 countDownLatchOnTerminated = new Action0() {

            @Override
            public void call() {
                waitUntilAllCompleted.countDown();
            }
        };
        Action0 nullOnCompleted = new Action0() {

            @Override
            public void call() {
            }
        };
        Action1<Throwable> nullOnError = new Action1<Throwable>() {

            @Override
            public void call(Throwable t) {
            }
        };
        workerInfoList = new ArrayList<>();
        for (int i = 0; i < numInstances; i++) {
            WorkerPorts workerPorts = new WorkerPorts(portSelector.acquirePort(), portSelector.acquirePort(),
                    portSelector.acquirePort(), portSelector.acquirePort(), portSelector.acquirePort());
            WorkerInfo workerInfo = new WorkerInfo(jobId, jobId, stages.size(), i, workerNumber++, MantisJobDurationType.Perpetual, "localhost", workerPorts);
            workerInfoList.add(workerInfo);
            Context context = new Context(ParameterUtils.createContextParameters(parameterDefinitions, parameters),
                    serviceLocator, workerInfo, MetricsRegistry.getInstance(), nullAction, workerMapObservable);
            startSink(previousStage, previousPorts, currentStage,
                    () -> workerInfo.getWorkerPorts().getSinkPort(), sink, context,
                    countDownLatchOnTerminated, nullOnCompleted, nullOnError,
                    stages.size(), i, previousStageScalingInfo.getNumberOfInstances());
        }
        workerInfoMap.put(stages.size(), workerInfoList);
        workerMapObservable.onNext(new WorkerMap(workerInfoMap));
        // wait for all instances to complete
        try {
            waitUntilAllCompleted.await();
        } catch (InterruptedException e) {
            // restore interrupt status before rethrowing as unchecked
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }
    lifecycle.shutdown();
    metricsServer.shutdown();
}
Aggregations