use of org.apache.flink.runtime.clusterframework.messages.StopCluster in project flink by apache.
the class ClusterShutdownITCase method testClusterShutdownWithoutResourceManager.
/**
 * Exercises the simulated cluster shutdown path when no ResourceManager is connected.
 *
 * <p>The test starts a JobManager and a TaskManager, asks both to report component
 * shutdowns, sends a {@code StopCluster} message to the JobManager, and then expects
 * a shutdown notification for each of the two components together with the
 * {@code StopClusterSuccessful} reply, all inside a 30 second window.
 */
@Test
public void testClusterShutdownWithoutResourceManager() {
    new JavaTestKit(system) {
        {
            new Within(duration("30 seconds")) {
                @Override
                protected void run() {
                    // gateway used as the sender of every message below; it is created
                    // around the test actor so that replies can be checked with expectMsg*
                    final ActorGateway testGateway =
                        TestingUtils.createForwardingActor(system, getTestActor(), Option.<String>empty());

                    // start job manager which doesn't shutdown the actor system
                    final ActorGateway jobManagerGateway = TestingUtils.createJobManager(
                        system,
                        TestingUtils.defaultExecutor(),
                        TestingUtils.defaultExecutor(),
                        config,
                        "jobmanager1");

                    // subscribe to shutdown notifications of the JobManager
                    jobManagerGateway.tell(TestingMessages.getNotifyOfComponentShutdown(), testGateway);

                    // bring up a TaskManager for that JobManager
                    final ActorGateway taskManagerGateway =
                        TestingUtils.createTaskManager(system, jobManagerGateway, config, true, true);

                    // subscribe to shutdown notifications of the TaskManager
                    taskManagerGateway.tell(TestingMessages.getNotifyOfComponentShutdown(), testGateway);

                    // deliberately no ResourceManager here: request the shutdown right away
                    final StopCluster stopRequest =
                        new StopCluster(ApplicationStatus.SUCCEEDED, "Shutting down.");
                    jobManagerGateway.tell(stopRequest, testGateway);

                    expectMsgAllOf(
                        new TestingMessages.ComponentShutdown(taskManagerGateway.actor()),
                        new TestingMessages.ComponentShutdown(jobManagerGateway.actor()),
                        StopClusterSuccessful.getInstance());
                }
            };
        }
    };
}
use of org.apache.flink.runtime.clusterframework.messages.StopCluster in project flink by apache.
the class YarnFlinkResourceManager method containersComplete.
/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * <p>A container that is found in {@code containersBeingReturned} was released by this
 * manager and is only logged. Every other completed container is counted as a failure:
 * if it was still in {@code containersInLaunch} it is merely logged, otherwise the
 * generic worker-failure logic is notified. Once the number of failures exceeds
 * {@code maxFailedContainers} (a negative limit disables the check), a
 * {@code StopCluster} message with status {@code FAILED} is sent to this actor and the
 * method returns immediately, without processing the remaining statuses.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());

        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics());
        } else {
            // failed container, either at startup, or running
            final String exitStatus;
            switch (status.getExitStatus()) {
                case -103:
                    exitStatus = "Vmem limit exceeded (-103)";
                    break;
                case -104:
                    exitStatus = "Pmem limit exceeded (-104)";
                    break;
                default:
                    // any other code is reported as its plain numeric value
                    exitStatus = String.valueOf(status.getExitStatus());
                    break;
            }

            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                LOG.info("Container {} failed, with a TaskManager in launch or registration. "
                    + "Exit status: {}", id, exitStatus);
                // we will trigger re-acquiring new containers at the end
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                // notify the generic logic, which notifies the JobManager, etc.
                // The reason is built by plain concatenation, so it must not contain
                // a logger placeholder ("{}"), which would otherwise show up verbatim.
                notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
            }

            // general failure logging
            failedContainersSoFar++;

            String diagMessage = String.format("Diagnostics for container %s in state %s : "
                + "exitStatus=%s diagnostics=%s", id, status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);

            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: {}", failedContainersSoFar);

            // maxFailedContainers == -1 is infinite number of retries.
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers ("
                    + failedContainersSoFar + ") exceeded the maximum failed containers ("
                    + maxFailedContainers + "). This number is controlled by the '"
                    + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
                    + "By default its the number of requested containers.";

                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)), ActorRef.noSender());

                // no need to do anything else
                return;
            }
        }
    }

    updateProgress();

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
use of org.apache.flink.runtime.clusterframework.messages.StopCluster in project flink by apache.
the class MesosFlinkResourceManager method taskTerminated.
/**
 * Invoked when a Mesos task reaches a terminal status.
 *
 * <p>The worker is first removed from the worker store; a failure to do so is reported
 * through {@code fatalError}, and a termination notice for a worker the store did not
 * contain is logged and ignored. A worker found in {@code workersBeingReturned} is
 * logged as regularly finished. Any other worker counts as failed: the failure counter
 * is incremented and, once it exceeds {@code maxFailedTasks} (a negative limit disables
 * the check), a {@code StopCluster} message with status {@code FAILED} is sent to this
 * actor. Unless one of the early exits is taken, the worker pool is re-checked at the end.
 *
 * @param taskID The ID of the terminated Mesos task.
 * @param status The terminal status reported for the task.
 */
private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
    // this callback occurs for failed containers and for released containers alike
    final ResourceID workerId = extractResourceID(taskID);

    boolean knownToStore;
    try {
        knownToStore = workerStore.removeWorker(taskID);
    } catch (Exception storeFailure) {
        fatalError("unable to remove worker", storeFailure);
        return;
    }

    if (!knownToStore) {
        LOG.info("Received a termination notice for an unrecognized worker: {}", workerId);
        return;
    }

    // distinguish a task that failed from one that we released ourselves
    final boolean releasedByUs = workersBeingReturned.remove(workerId) != null;

    if (!releasedByUs) {
        // failed worker, either at startup, or running
        final boolean wasStillLaunching = workersInLaunch.remove(workerId) != null;

        if (wasStillLaunching) {
            // nothing to notify; new workers are re-acquired at the end
            LOG.info("Mesos task {} failed, with a TaskManager in launch or registration. "
                + "State: {} Reason: {} ({})",
                workerId, status.getState(), status.getReason(), status.getMessage());
        } else {
            // failed registered worker
            LOG.info("Mesos task {} failed, with a registered TaskManager. "
                + "State: {} Reason: {} ({})",
                workerId, status.getState(), status.getReason(), status.getMessage());

            // notify the generic logic, which notifies the JobManager, etc.
            notifyWorkerFailed(workerId, "Mesos task " + workerId + " failed. State: " + status.getState());
        }

        // general failure logging
        failedTasksSoFar++;

        final String diagnostics = String.format(
            "Diagnostics for task %s in state %s : " + "reason=%s message=%s",
            workerId, status.getState(), status.getReason(), status.getMessage());
        sendInfoMessage(diagnostics);

        LOG.info(diagnostics);
        LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);

        // maxFailedTasks == -1 is infinite number of retries.
        final boolean failureLimitExceeded = maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks;
        if (failureLimitExceeded) {
            final String stopReason = "Stopping Mesos session because the number of failed tasks ("
                + failedTasksSoFar + ") exceeded the maximum failed tasks ("
                + maxFailedTasks + "). This number is controlled by the '"
                + ConfigConstants.MESOS_MAX_FAILED_TASKS + "' configuration setting. "
                + "By default its the number of requested tasks.";

            LOG.error(stopReason);
            self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, stopReason)), ActorRef.noSender());

            // no need to do anything else
            return;
        }
    } else {
        // regular finished worker that we released
        LOG.info("Worker {} finished successfully with diagnostics: {}", workerId, status.getMessage());
    }

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
use of org.apache.flink.runtime.clusterframework.messages.StopCluster in project flink by apache.
the class FlinkResourceManager method handleMessage.
/**
 * Dispatches an incoming actor message to the handler that matches its type.
 *
 * <p>This method receives the actor messages after they have been filtered for
 * a match with the leader session. Messages of an unknown type are logged and
 * discarded. Any {@code Throwable} raised while handling a message is passed to
 * {@code fatalError}.
 *
 * @param message The incoming actor message.
 */
@Override
protected void handleMessage(Object message) {
    try {
        if (message instanceof CheckAndAllocateContainers) {
            checkWorkersPool();
        }
        else if (message instanceof SetWorkerPoolSize) {
            adjustDesignatedNumberOfWorkers(((SetWorkerPoolSize) message).numberOfWorkers());
        }
        else if (message instanceof RemoveResource) {
            removeRegisteredResource(((RemoveResource) message).resourceId());
        }
        else if (message instanceof NotifyResourceStarted) {
            final NotifyResourceStarted started = (NotifyResourceStarted) message;
            handleResourceStarted(sender(), started.getResourceID());
        }
        else if (message instanceof NewLeaderAvailable) {
            final NewLeaderAvailable newLeader = (NewLeaderAvailable) message;
            newJobManagerLeaderAvailable(newLeader.leaderAddress(), newLeader.leaderSessionId());
        }
        else if (message instanceof TriggerRegistrationAtJobManager) {
            triggerConnectingToJobManager(((TriggerRegistrationAtJobManager) message).jobManagerAddress());
        }
        else if (message instanceof RegisterResourceManagerSuccessful) {
            final RegisterResourceManagerSuccessful registered = (RegisterResourceManagerSuccessful) message;
            jobManagerLeaderConnected(registered.jobManager(), registered.currentlyRegisteredTaskManagers());
        }
        else if (message instanceof StopCluster) {
            final StopCluster stopRequest = (StopCluster) message;
            shutdownCluster(stopRequest.finalStatus(), stopRequest.message());
            // confirm to whoever asked for the shutdown
            sender().tell(decorateMessage(StopClusterSuccessful.getInstance()), ActorRef.noSender());
        }
        else if (message instanceof RegisterInfoMessageListener) {
            // the request is dropped without a reply while no JobManager reference is set
            if (jobManager != null) {
                infoMessageListeners.add(sender());
                sender().tell(
                    decorateMessage(RegisterInfoMessageListenerSuccessful.get()),
                    // answer as the JobManager
                    jobManager);
            }
        }
        else if (message instanceof UnRegisterInfoMessageListener) {
            infoMessageListeners.remove(sender());
        }
        else if (message instanceof FatalErrorOccurred) {
            final FatalErrorOccurred failure = (FatalErrorOccurred) message;
            fatalError(failure.message(), failure.error());
        }
        else {
            // --- unknown messages
            LOG.error("Discarding unknown message: {}", message);
        }
    } catch (Throwable t) {
        // fatal error, needs master recovery
        fatalError("Error processing actor message", t);
    }
}
use of org.apache.flink.runtime.clusterframework.messages.StopCluster in project flink by apache.
the class ClusterShutdownITCase method testClusterShutdownWithResourceManager.
/**
 * Exercises the simulated cluster shutdown path with a ResourceManager connected.
 *
 * <p>The test starts a JobManager, a TaskManager and a ResourceManager, asks all three
 * to report component shutdowns, and waits for an acknowledgement to its
 * {@code NotifyWhenResourceManagerConnected} request. It then sends a {@code StopCluster}
 * message to the JobManager and expects a shutdown notification for each of the three
 * components together with the {@code StopClusterSuccessful} reply, all inside a
 * 30 second window.
 */
@Test
public void testClusterShutdownWithResourceManager() {
    new JavaTestKit(system) {
        {
            new Within(duration("30 seconds")) {
                @Override
                protected void run() {
                    // gateway used as the sender of every message below; it is created
                    // around the test actor so that replies can be checked with expectMsg*
                    final ActorGateway testGateway =
                        TestingUtils.createForwardingActor(system, getTestActor(), Option.<String>empty());

                    // start job manager which doesn't shutdown the actor system
                    final ActorGateway jobManagerGateway = TestingUtils.createJobManager(
                        system,
                        TestingUtils.defaultExecutor(),
                        TestingUtils.defaultExecutor(),
                        config,
                        "jobmanager2");

                    // subscribe to shutdown notifications of the JobManager
                    jobManagerGateway.tell(TestingMessages.getNotifyOfComponentShutdown(), testGateway);

                    // bring up a TaskManager for that JobManager
                    final ActorGateway taskManagerGateway =
                        TestingUtils.createTaskManager(system, jobManagerGateway, config, true, true);

                    // subscribe to shutdown notifications of the TaskManager
                    taskManagerGateway.tell(TestingMessages.getNotifyOfComponentShutdown(), testGateway);

                    // start resource manager and let it register
                    final ActorGateway resourceManagerGateway =
                        TestingUtils.createResourceManager(system, jobManagerGateway.actor(), config);

                    // subscribe to shutdown notifications of the ResourceManager
                    resourceManagerGateway.tell(TestingMessages.getNotifyOfComponentShutdown(), testGateway);

                    // ask to be notified about the resource manager registration at the job manager
                    resourceManagerGateway.tell(
                        new TestingResourceManager.NotifyWhenResourceManagerConnected(), testGateway);

                    // block until the resource manager has answered
                    expectMsgEquals(Acknowledge.get());

                    // shut the cluster down with the resource manager connected
                    final StopCluster stopRequest =
                        new StopCluster(ApplicationStatus.SUCCEEDED, "Shutting down.");
                    jobManagerGateway.tell(stopRequest, testGateway);

                    expectMsgAllOf(
                        new TestingMessages.ComponentShutdown(taskManagerGateway.actor()),
                        new TestingMessages.ComponentShutdown(jobManagerGateway.actor()),
                        new TestingMessages.ComponentShutdown(resourceManagerGateway.actor()),
                        StopClusterSuccessful.getInstance());
                }
            };
        }
    };
}
Aggregations