Use of com.mesosphere.sdk.scheduler.recovery.constrain.UnconstrainedLaunchConstrainer in project dcos-commons by mesosphere.
From the class ServiceTest, method transientToCustomPermanentFailureTransition.
/**
 * Drives a full deployment, fails {@code hello-0-server} so that default transient recovery starts,
 * then requests a pod replacement and checks that the custom permanent-recovery phase supplied by
 * the overrider factory is the single phase of the recovery plan and runs to completion.
 */
@Test
public void transientToCustomPermanentFailureTransition() throws Exception {
    // An offer carrying only 1.0 of "mem": it is sent purely to make the scheduler re-evaluate the recovery plan.
    Protos.Offer memOnlyOffer = Protos.Offer.newBuilder()
            .setId(Protos.OfferID.newBuilder().setValue(UUID.randomUUID().toString()))
            .setFrameworkId(TestConstants.FRAMEWORK_ID)
            .setSlaveId(TestConstants.AGENT_ID)
            .setHostname(TestConstants.HOSTNAME)
            .addResources(Protos.Resource.newBuilder()
                    .setName("mem")
                    .setType(Protos.Value.Type.SCALAR)
                    .setScalar(Protos.Value.Scalar.newBuilder().setValue(1.0)))
            .build();

    Collection<SimulationTick> script = new ArrayList<>();

    // Registration.
    Collections.<SimulationTick>addAll(script,
            Send.register(),
            Expect.reconciledImplicitly());

    // Initial deployment: 1 hello pod, then 2 world pods.
    Collections.<SimulationTick>addAll(script,
            Send.offerBuilder("hello").build(),
            Expect.launchedTasks("hello-0-server"),
            // An offer that arrives before hello-0 is finished gets declined.
            Send.offerBuilder("world").build(),
            Expect.declinedLastOffer(),
            // hello-0 is running; no readiness check applies to it.
            Send.taskStatus("hello-0-server", Protos.TaskState.TASK_RUNNING).build(),
            // world-0 may deploy now.
            Send.offerBuilder("world").build(),
            Expect.launchedTasks("world-0-server"),
            // world-0 passes its readiness check; world-1 is still held back by a hostname placement constraint.
            Send.taskStatus("world-0-server", Protos.TaskState.TASK_RUNNING).setReadinessCheckExitCode(0).build(),
            // An offer from a different hostname lets world-1 launch.
            Send.offerBuilder("world").setHostname("host-foo").build(),
            Expect.launchedTasks("world-1-server"),
            Send.taskStatus("world-1-server", Protos.TaskState.TASK_RUNNING).setReadinessCheckExitCode(0).build(),
            // *** Initial deployment is complete. ***
            Expect.allPlansComplete());

    // Transient recovery: kill hello-0, then nudge the scheduler with the unusable offer.
    Collections.<SimulationTick>addAll(script,
            Send.taskStatus("hello-0-server", Protos.TaskState.TASK_FAILED).build(),
            Send.offer(memOnlyOffer),
            // The default transient recovery step should have been prepared.
            Expect.recoveryStepStatus("hello-0:[server]", "hello-0:[server]", Status.PREPARED));

    // Custom permanent replacement of the same pod, followed by another nudge.
    Collections.<SimulationTick>addAll(script,
            Send.replacePod("hello-0"),
            Send.offer(memOnlyOffer));

    // Expectation specific to this test: the recovery plan holds exactly one phase.
    Expect singleRecoveryPhase = new Expect() {
        @Override
        public void expect(ClusterState state, SchedulerDriver mockDriver) throws AssertionError {
            Optional<Plan> recovery = state.getPlans().stream()
                    .filter(candidate -> candidate.getName().equals("recovery"))
                    .findAny();
            Assert.assertEquals(1, recovery.get().getChildren().size());
        }

        @Override
        public String getDescription() {
            return "Single recovery phase";
        }
    };
    Collections.<SimulationTick>addAll(script,
            singleRecoveryPhase,
            Expect.recoveryStepStatus("custom-hello-recovery", "hello-0", Status.PREPARED));

    // Finish the recovery.
    Collections.<SimulationTick>addAll(script,
            Send.offerBuilder("hello").build(),
            Expect.launchedTasks("hello-0-server"),
            Send.taskStatus("hello-0-server", Protos.TaskState.TASK_RUNNING).build(),
            Expect.allPlansComplete());

    ServiceTestRunner runner = new ServiceTestRunner();
    // Overrider that replaces PERMANENT recovery of "hello" pods with a one-step custom phase.
    RecoveryPlanOverriderFactory customHelloRecovery = new RecoveryPlanOverriderFactory() {
        @Override
        public RecoveryPlanOverrider create(StateStore stateStore, Collection<Plan> plans) {
            return new RecoveryPlanOverrider() {
                @Override
                public Optional<Phase> override(PodInstanceRequirement requirement) {
                    if (!requirement.getPodInstance().getPod().getType().equals("hello")
                            || !requirement.getRecoveryType().equals(RecoveryType.PERMANENT)) {
                        // Anything else falls through to the default recovery behavior.
                        return Optional.empty();
                    }
                    RecoveryStep replacement = new RecoveryStep(
                            requirement.getPodInstance().getName(),
                            requirement,
                            new UnconstrainedLaunchConstrainer(),
                            stateStore);
                    Phase customPhase = new DefaultPhase(
                            "custom-hello-recovery",
                            Arrays.asList(replacement),
                            new SerialStrategy<>(),
                            Collections.emptyList());
                    return Optional.of(customPhase);
                }
            };
        }
    };
    runner.setRecoveryManagerFactory(customHelloRecovery).run(script);
}
Use of com.mesosphere.sdk.scheduler.recovery.constrain.UnconstrainedLaunchConstrainer in project dcos-commons by mesosphere.
From the class HdfsRecoveryPlanOverrider, method getRecoveryPhase.
/**
 * Assembles a serial, two-step recovery phase for the node at {@code index} of the given node type.
 *
 * <p>The source phase is looked up via {@code getPhaseForNodeType}; the steps at positions
 * {@code 2 * index} and {@code 2 * index + 1} are re-created as recovery steps. The first (bootstrap)
 * step is recovered as {@code PERMANENT}, the second (JournalNode or NameNode) as {@code TRANSIENT}.
 *
 * @param inputPlan plan from which the source phase is taken
 * @param index     index of the node being recovered
 * @param phaseName node type name, also substituted into {@code PHASE_NAME_TEMPLATE}
 * @return a phase holding the bootstrap step followed by the node step
 */
private Phase getRecoveryPhase(Plan inputPlan, int index, String phaseName) {
    final Phase sourcePhase = getPhaseForNodeType(inputPlan, phaseName);
    final int bootstrapPosition = 2 * index;
    final int nodePosition = bootstrapPosition + 1;

    Step bootstrapStep = buildRecoveryStep(sourcePhase.getChildren().get(bootstrapPosition), RecoveryType.PERMANENT);
    Step nodeStep = buildRecoveryStep(sourcePhase.getChildren().get(nodePosition), RecoveryType.TRANSIENT);

    return new DefaultPhase(
            String.format(PHASE_NAME_TEMPLATE, phaseName),
            Arrays.asList(bootstrapStep, nodeStep),
            new SerialStrategy<>(),
            Collections.emptyList());
}

/** Re-creates {@code source} as an unconstrained recovery step of the same name with the given recovery type. */
private Step buildRecoveryStep(Step source, RecoveryType recoveryType) {
    // NOTE(review): start() is deliberately called once per accessor, exactly as before;
    // confirm it is side-effect free before collapsing the two calls into one.
    PodInstanceRequirement requirement = PodInstanceRequirement
            .newBuilder(source.start().get().getPodInstance(), source.start().get().getTasksToLaunch())
            .recoveryType(recoveryType)
            .build();
    return new RecoveryStep(source.getName(), requirement, new UnconstrainedLaunchConstrainer(), stateStore);
}
Use of com.mesosphere.sdk.scheduler.recovery.constrain.UnconstrainedLaunchConstrainer in project dcos-commons by mesosphere.
From the class CassandraRecoveryPlanOverrider, method getNodeRecoveryPhase.
/**
 * Builds the recovery phase that permanently replaces the Cassandra node at {@code index}.
 *
 * <p>The command of the pod's "server" task is rewritten to append
 * {@code -Dcassandra.replace_address=<ip>} and {@code -Dcassandra.consistent.rangemovement=false},
 * where the IP address is read from the task's previously stored {@code TaskStatus}. The resulting
 * phase starts with a {@code PERMANENT} recovery step for that node; when the node is a seed node,
 * {@code TRANSIENT} recovery steps for all other nodes of the input phase are appended.
 *
 * @param inputPlan plan whose first phase supplies the launch steps
 * @param index     index of the node to replace; also the position of its launch step in that phase
 * @return the recovery phase, or {@code null} when no stored status, or no IP address within the
 *         stored status, is available
 */
private Phase getNodeRecoveryPhase(Plan inputPlan, int index) {
    Phase inputPhase = inputPlan.getChildren().get(0);
    Step inputLaunchStep = inputPhase.getChildren().get(index);
    // Dig all the way down into the command, so we can append the replace_address option to it.
    PodInstance podInstance = inputLaunchStep.start().get().getPodInstance();
    PodSpec podSpec = podInstance.getPod();
    TaskSpec taskSpec = podSpec.getTasks().stream().filter(t -> t.getName().equals("server")).findFirst().get();
    CommandSpec command = taskSpec.getCommand().get();
    // Get IP address for the pre-existing node.
    Optional<Protos.TaskStatus> status = StateStoreUtils.getTaskStatusFromProperty(stateStore, TaskSpec.getInstanceName(podInstance, taskSpec));
    if (!status.isPresent()) {
        logger.error("No previously stored TaskStatus to pull IP address from in Cassandra recovery");
        return null;
    }
    Protos.TaskStatus storedStatus = status.get();
    // Indexed access into an empty protobuf repeated field throws IndexOutOfBoundsException, so a status
    // stored without network info would otherwise abort recovery with an unhandled exception.
    // NOTE(review): this takes the same null-return path as the missing-status case above; confirm the
    // caller treats null as "no custom recovery phase".
    if (storedStatus.getContainerStatus().getNetworkInfosCount() == 0
            || storedStatus.getContainerStatus().getNetworkInfos(0).getIpAddressesCount() == 0) {
        logger.error("Previously stored TaskStatus has no IP address to pull for Cassandra recovery");
        return null;
    }
    String replaceIp = storedStatus.getContainerStatus().getNetworkInfos(0).getIpAddresses(0).getIpAddress();
    DefaultCommandSpec.Builder builder = DefaultCommandSpec.newBuilder(command);
    // NOTE(review): the format string ends in %n, so a platform line separator is appended to the
    // command value — confirm this is intended.
    builder.value(String.format("%s -Dcassandra.replace_address=%s -Dcassandra.consistent.rangemovement=false%n", command.getValue().trim(), replaceIp));
    // Rebuild a new PodSpec with the modified command, and add it to the phase we return.
    TaskSpec newTaskSpec = DefaultTaskSpec.newBuilder(taskSpec).commandSpec(builder.build()).build();
    // Swap in the rewritten task by name; every other task of the pod is carried over unchanged.
    List<TaskSpec> tasks = podSpec.getTasks().stream().map(t -> {
        if (t.getName().equals(newTaskSpec.getName())) {
            return newTaskSpec;
        }
        return t;
    }).collect(Collectors.toList());
    PodSpec newPodSpec = DefaultPodSpec.newBuilder(podSpec).tasks(tasks).build();
    PodInstance newPodInstance = new DefaultPodInstance(newPodSpec, index);
    PodInstanceRequirement replacePodInstanceRequirement = PodInstanceRequirement.newBuilder(newPodInstance, inputLaunchStep.getPodInstanceRequirement().get().getTasksToLaunch()).recoveryType(RecoveryType.PERMANENT).build();
    Step replaceStep = new RecoveryStep(inputLaunchStep.getName(), replacePodInstanceRequirement, new UnconstrainedLaunchConstrainer(), stateStore);
    List<Step> steps = new ArrayList<>();
    steps.add(replaceStep);
    // Restart all other nodes if replacing a seed node to refresh IP resolution
    int replaceIndex = replaceStep.getPodInstanceRequirement().get().getPodInstance().getIndex();
    if (CassandraSeedUtils.isSeedNode(replaceIndex)) {
        logger.info("Scheduling restart of all nodes other than 'node-{}' to refresh seed node address.", replaceIndex);
        List<Step> restartSteps = inputPhase.getChildren().stream().filter(step -> step.getPodInstanceRequirement().get().getPodInstance().getIndex() != replaceIndex).map(step -> {
            PodInstanceRequirement restartPodInstanceRequirement = PodInstanceRequirement.newBuilder(step.getPodInstanceRequirement().get().getPodInstance(), step.getPodInstanceRequirement().get().getTasksToLaunch()).recoveryType(RecoveryType.TRANSIENT).build();
            return new RecoveryStep(step.getName(), restartPodInstanceRequirement, new UnconstrainedLaunchConstrainer(), stateStore);
        }).collect(Collectors.toList());
        steps.addAll(restartSteps);
    }
    // The replacement step runs first, followed serially by any restart steps.
    return new DefaultPhase(RECOVERY_PHASE_NAME, steps, new SerialStrategy<>(), Collections.emptyList());
}
Use of com.mesosphere.sdk.scheduler.recovery.constrain.UnconstrainedLaunchConstrainer in project dcos-commons by mesosphere.
From the class SchedulerBuilder, method getRecoveryPlanManager.
/**
 * Creates the plan manager that handles recovery for the service.
 *
 * <p>If an overrider factory is supplied, the overrider it creates is passed to the manager. When
 * the service spec declares a replacement failure policy, the manager gets a
 * {@code TimedLaunchConstrainer} built from the policy's minimum replace delay and a
 * {@code TimedFailureMonitor} built from its permanent failure timeout (both in minutes);
 * otherwise it gets an {@code UnconstrainedLaunchConstrainer} and a {@code NeverFailureMonitor}.
 *
 * @param serviceSpec              spec consulted for the replacement failure policy
 * @param recoveryOverriderFactory optional factory for a custom recovery plan overrider
 * @param stateStore               state store handed to the overrider, failure monitor and manager
 * @param configStore              config store handed to the failure monitor and manager
 * @param plans                    plans handed to the overrider and used to derive launchable tasks
 * @return a {@code DefaultRecoveryPlanManager} wired with the selected constrainer and monitor
 */
private PlanManager getRecoveryPlanManager(
        ServiceSpec serviceSpec,
        Optional<RecoveryPlanOverriderFactory> recoveryOverriderFactory,
        StateStore stateStore,
        ConfigStore<ServiceSpec> configStore,
        Collection<Plan> plans) {
    List<RecoveryPlanOverrider> overriders = new ArrayList<>();
    if (recoveryOverriderFactory.isPresent()) {
        logger.info("Adding overriding recovery plan manager.");
        RecoveryPlanOverriderFactory factory = recoveryOverriderFactory.get();
        overriders.add(factory.create(stateStore, plans));
    }

    final LaunchConstrainer constrainer;
    final FailureMonitor monitor;
    if (!serviceSpec.getReplacementFailurePolicy().isPresent()) {
        // No policy configured: launches are unconstrained and failures are never reported by the monitor.
        constrainer = new UnconstrainedLaunchConstrainer();
        monitor = new NeverFailureMonitor();
    } else {
        ReplacementFailurePolicy policy = serviceSpec.getReplacementFailurePolicy().get();
        Duration minReplaceDelay = Duration.ofMinutes(policy.getMinReplaceDelayMin());
        constrainer = new TimedLaunchConstrainer(minReplaceDelay);
        Duration permanentFailureTimeout = Duration.ofMinutes(policy.getPermanentFailureTimoutMin());
        monitor = new TimedFailureMonitor(permanentFailureTimeout, stateStore, configStore);
    }

    return new DefaultRecoveryPlanManager(
            stateStore,
            configStore,
            PlanUtils.getLaunchableTasks(plans),
            constrainer,
            monitor,
            overriders);
}
Aggregations