Search in sources :

Example 1 with ApplicationFinishEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent in project hadoop by apache.

the class ContainerManagerImpl method handle.

@SuppressWarnings("unchecked")
@Override
public void handle(ContainerManagerEvent event) {
    switch(event.getType()) {
        case FINISH_APPS:
            CMgrCompletedAppsEvent appsFinishedEvent = (CMgrCompletedAppsEvent) event;
            for (ApplicationId appID : appsFinishedEvent.getAppsToCleanup()) {
                Application app = this.context.getApplications().get(appID);
                if (app == null) {
                    LOG.warn("couldn't find application " + appID + " while processing" + " FINISH_APPS event");
                    continue;
                }
                boolean shouldDropEvent = false;
                for (Container container : app.getContainers().values()) {
                    if (container.isRecovering()) {
                        LOG.info("drop FINISH_APPS event to " + appID + " because " + "container " + container.getContainerId() + " is recovering");
                        shouldDropEvent = true;
                        break;
                    }
                }
                if (shouldDropEvent) {
                    continue;
                }
                String diagnostic = "";
                if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) {
                    diagnostic = "Application killed on shutdown";
                } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
                    diagnostic = "Application killed by ResourceManager";
                }
                this.dispatcher.getEventHandler().handle(new ApplicationFinishEvent(appID, diagnostic));
            }
            break;
        case FINISH_CONTAINERS:
            CMgrCompletedContainersEvent containersFinishedEvent = (CMgrCompletedContainersEvent) event;
            for (ContainerId containerId : containersFinishedEvent.getContainersToCleanup()) {
                ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId();
                Application app = this.context.getApplications().get(appId);
                if (app == null) {
                    LOG.warn("couldn't find app " + appId + " while processing" + " FINISH_CONTAINERS event");
                    continue;
                }
                Container container = app.getContainers().get(containerId);
                if (container == null) {
                    LOG.warn("couldn't find container " + containerId + " while processing FINISH_CONTAINERS event");
                    continue;
                }
                if (container.isRecovering()) {
                    LOG.info("drop FINISH_CONTAINERS event to " + containerId + " because container is recovering");
                    continue;
                }
                this.dispatcher.getEventHandler().handle(new ContainerKillEvent(containerId, ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, "Container Killed by ResourceManager"));
            }
            break;
        case DECREASE_CONTAINERS_RESOURCE:
            CMgrDecreaseContainersResourceEvent containersDecreasedEvent = (CMgrDecreaseContainersResourceEvent) event;
            for (org.apache.hadoop.yarn.api.records.Container container : containersDecreasedEvent.getContainersToDecrease()) {
                try {
                    changeContainerResourceInternal(container.getId(), container.getVersion(), container.getResource(), false);
                } catch (YarnException e) {
                    LOG.error("Unable to decrease container resource", e);
                } catch (IOException e) {
                    LOG.error("Unable to update container resource in store", e);
                }
            }
            break;
        case SIGNAL_CONTAINERS:
            CMgrSignalContainersEvent containersSignalEvent = (CMgrSignalContainersEvent) event;
            for (SignalContainerRequest request : containersSignalEvent.getContainersToSignal()) {
                internalSignalToContainer(request, "ResourceManager");
            }
            break;
        default:
            throw new YarnRuntimeException("Got an unknown ContainerManagerEvent type: " + event.getType());
    }
}
Also used : ApplicationFinishEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent) CMgrDecreaseContainersResourceEvent(org.apache.hadoop.yarn.server.nodemanager.CMgrDecreaseContainersResourceEvent) CMgrSignalContainersEvent(org.apache.hadoop.yarn.server.nodemanager.CMgrSignalContainersEvent) CMgrCompletedContainersEvent(org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedContainersEvent) SignalContainerRequest(org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest) CMgrCompletedAppsEvent(org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent) ContainerKillEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent) ByteString(com.google.protobuf.ByteString) IOException(java.io.IOException) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) Container(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application)

Example 2 with ApplicationFinishEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent in project hadoop by apache.

the class TestContainerManagerRecovery method testApplicationRecovery.

@Test
public void testApplicationRecovery() throws Exception {
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
    conf.setBoolean(YarnConfiguration.YARN_ACL_ENABLE, true);
    conf.set(YarnConfiguration.YARN_ADMIN_ACL, "yarn_admin_user");
    NMStateStoreService stateStore = new NMMemoryStateStoreService();
    stateStore.init(conf);
    stateStore.start();
    Context context = createContext(conf, stateStore);
    ContainerManagerImpl cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    // add an application by starting a container
    String appUser = "app_user1";
    String modUser = "modify_user1";
    String viewUser = "view_user1";
    String enemyUser = "enemy_user";
    ApplicationId appId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
    Map<String, LocalResource> localResources = Collections.emptyMap();
    Map<String, String> containerEnv = Collections.emptyMap();
    List<String> containerCmds = Collections.emptyList();
    Map<String, ByteBuffer> serviceData = Collections.emptyMap();
    Credentials containerCreds = new Credentials();
    DataOutputBuffer dob = new DataOutputBuffer();
    containerCreds.writeTokenStorageToStream(dob);
    ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    Map<ApplicationAccessType, String> acls = new HashMap<ApplicationAccessType, String>();
    acls.put(ApplicationAccessType.MODIFY_APP, modUser);
    acls.put(ApplicationAccessType.VIEW_APP, viewUser);
    ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, containerCmds, serviceData, containerTokens, acls);
    // create the logAggregationContext
    LogAggregationContext logAggregationContext = LogAggregationContext.newInstance("includePattern", "excludePattern", "includePatternInRollingAggregation", "excludePatternInRollingAggregation");
    StartContainersResponse startResponse = startContainer(context, cm, cid, clc, logAggregationContext);
    assertTrue(startResponse.getFailedRequests().isEmpty());
    assertEquals(1, context.getApplications().size());
    Application app = context.getApplications().get(appId);
    assertNotNull(app);
    waitForAppState(app, ApplicationState.INITING);
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(modUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(enemyUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    // reset container manager and verify app recovered with proper acls
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertEquals(1, context.getApplications().size());
    app = context.getApplications().get(appId);
    assertNotNull(app);
    // check whether LogAggregationContext is recovered correctly
    LogAggregationContext recovered = ((ApplicationImpl) app).getLogAggregationContext();
    assertNotNull(recovered);
    assertEquals(logAggregationContext.getIncludePattern(), recovered.getIncludePattern());
    assertEquals(logAggregationContext.getExcludePattern(), recovered.getExcludePattern());
    assertEquals(logAggregationContext.getRolledLogsIncludePattern(), recovered.getRolledLogsIncludePattern());
    assertEquals(logAggregationContext.getRolledLogsExcludePattern(), recovered.getRolledLogsExcludePattern());
    waitForAppState(app, ApplicationState.INITING);
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(modUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(enemyUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    // simulate application completion
    List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
    finishedApps.add(appId);
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    // restart and verify app is marked for finishing
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertEquals(1, context.getApplications().size());
    app = context.getApplications().get(appId);
    assertNotNull(app);
    // no longer saving FINISH_APP event in NM stateStore,
    // simulate by resending FINISH_APP event
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(modUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.MODIFY_APP, appUser, appId));
    assertTrue(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(viewUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    assertFalse(context.getApplicationACLsManager().checkAccess(UserGroupInformation.createRemoteUser(enemyUser), ApplicationAccessType.VIEW_APP, appUser, appId));
    // simulate log aggregation completion
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
    assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED));
    // restart and verify app is no longer present after recovery
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertTrue(context.getApplications().isEmpty());
    cm.stop();
}
Also used : ApplicationFinishEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent) HashMap(java.util.HashMap) ApplicationImpl(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl) ArrayList(java.util.ArrayList) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) NMMemoryStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService) FileContext(org.apache.hadoop.fs.FileContext) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) LogAggregationContext(org.apache.hadoop.yarn.api.records.LogAggregationContext) Context(org.apache.hadoop.yarn.server.nodemanager.Context) StartContainersResponse(org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse) ApplicationEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ByteBuffer(java.nio.ByteBuffer) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application) Credentials(org.apache.hadoop.security.Credentials) LogAggregationContext(org.apache.hadoop.yarn.api.records.LogAggregationContext) Test(org.junit.Test)

Example 3 with ApplicationFinishEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent in project hadoop by apache.

the class TestContainerManagerRecovery method testNMRecoveryForAppFinishedWithLogAggregationFailure.

@Test
public void testNMRecoveryForAppFinishedWithLogAggregationFailure() throws Exception {
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
    NMStateStoreService stateStore = new NMMemoryStateStoreService();
    stateStore.init(conf);
    stateStore.start();
    Context context = createContext(conf, stateStore);
    ContainerManagerImpl cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    // add an application by starting a container
    ApplicationId appId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
    Map<String, LocalResource> localResources = Collections.emptyMap();
    Map<String, String> containerEnv = Collections.emptyMap();
    List<String> containerCmds = Collections.emptyList();
    Map<String, ByteBuffer> serviceData = Collections.emptyMap();
    ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, containerCmds, serviceData, null, null);
    StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
    assertTrue(startResponse.getFailedRequests().isEmpty());
    assertEquals(1, context.getApplications().size());
    Application app = context.getApplications().get(appId);
    assertNotNull(app);
    waitForAppState(app, ApplicationState.INITING);
    // simulate application completion
    List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
    finishedApps.add(appId);
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
    assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
    // application is still in NM context.
    assertEquals(1, context.getApplications().size());
    // restart and verify app is still there and marked as finished.
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertEquals(1, context.getApplications().size());
    app = context.getApplications().get(appId);
    assertNotNull(app);
    // no longer saving FINISH_APP event in NM stateStore,
    // simulate by resending FINISH_APP event
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    // TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
    // is needed.
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
    assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
    // simulate log aggregation failed.
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
    // restart and verify app is no longer present after recovery
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertTrue(context.getApplications().isEmpty());
    cm.stop();
}
Also used : FileContext(org.apache.hadoop.fs.FileContext) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) LogAggregationContext(org.apache.hadoop.yarn.api.records.LogAggregationContext) Context(org.apache.hadoop.yarn.server.nodemanager.Context) ApplicationFinishEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent) StartContainersResponse(org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse) ArrayList(java.util.ArrayList) ApplicationEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ByteBuffer(java.nio.ByteBuffer) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application) NMMemoryStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService) Test(org.junit.Test)

Aggregations

ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)3 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)3 Application (org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application)3 ApplicationFinishEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent)3 ByteBuffer (java.nio.ByteBuffer)2 ArrayList (java.util.ArrayList)2 FileContext (org.apache.hadoop.fs.FileContext)2 StartContainersResponse (org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse)2 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)2 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)2 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)2 LogAggregationContext (org.apache.hadoop.yarn.api.records.LogAggregationContext)2 Context (org.apache.hadoop.yarn.server.nodemanager.Context)2 NMContext (org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext)2 ApplicationEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent)2 NMMemoryStateStoreService (org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService)2 NMStateStoreService (org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService)2 Test (org.junit.Test)2 ByteString (com.google.protobuf.ByteString)1 IOException (java.io.IOException)1