Search in sources :

Example 6 with NMStateStoreService

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.

the class TestNodeStatusUpdater method testCompletedContainerStatusBackup.

/**
   * Test completed containerStatus get back up when heart beat lost, and will
   * be sent via next heart beat.
   */
@Test(timeout = 200000)
public void testCompletedContainerStatusBackup() throws Exception {
    nm = new NodeManager() {

        @Override
        protected NodeStatusUpdater createNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
            MyNodeStatusUpdater2 myNodeStatusUpdater = new MyNodeStatusUpdater2(context, dispatcher, healthChecker, metrics);
            return myNodeStatusUpdater;
        }

        @Override
        protected NMContext createNMContext(NMContainerTokenSecretManager containerTokenSecretManager, NMTokenSecretManagerInNM nmTokenSecretManager, NMStateStoreService store, boolean isDistributedSchedulingEnabled, Configuration config) {
            return new MyNMContext(containerTokenSecretManager, nmTokenSecretManager, config);
        }
    };
    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    nm.start();
    int waitCount = 0;
    while (heartBeatID <= 4 && waitCount++ != 20) {
        Thread.sleep(500);
    }
    if (heartBeatID <= 4) {
        Assert.fail("Failed to get all heartbeats in time, " + "heartbeatID:" + heartBeatID);
    }
    if (assertionFailedInThread.get()) {
        Assert.fail("ContainerStatus Backup failed");
    }
    Assert.assertNotNull(nm.getNMContext().getSystemCredentialsForApps().get(ApplicationId.newInstance(1234, 1)).getToken(new Text("token1")));
    nm.stop();
}
Also used : FileContext(org.apache.hadoop.fs.FileContext) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NMTokenSecretManagerInNM(org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM) Text(org.apache.hadoop.io.Text) Dispatcher(org.apache.hadoop.yarn.event.Dispatcher) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NMContainerTokenSecretManager(org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager) Test(org.junit.Test)

Example 7 with NMStateStoreService

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.

the class TestContainerManagerRecovery method testContainerResizeRecovery.

@Test
public void testContainerResizeRecovery() throws Exception {
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
    NMStateStoreService stateStore = new NMMemoryStateStoreService();
    stateStore.init(conf);
    stateStore.start();
    Context context = createContext(conf, stateStore);
    ContainerManagerImpl cm = createContainerManager(context, delSrvc);
    cm.init(conf);
    cm.start();
    // add an application by starting a container
    ApplicationId appId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
    Map<String, String> containerEnv = Collections.emptyMap();
    Map<String, ByteBuffer> serviceData = Collections.emptyMap();
    Credentials containerCreds = new Credentials();
    DataOutputBuffer dob = new DataOutputBuffer();
    containerCreds.writeTokenStorageToStream(dob);
    ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    Map<ApplicationAccessType, String> acls = Collections.emptyMap();
    File tmpDir = new File("target", this.getClass().getSimpleName() + "-tmpDir");
    File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile");
    PrintWriter fileWriter = new PrintWriter(scriptFile);
    if (Shell.WINDOWS) {
        fileWriter.println("@ping -n 100 127.0.0.1 >nul");
    } else {
        fileWriter.write("\numask 0");
        fileWriter.write("\nexec sleep 100");
    }
    fileWriter.close();
    FileContext localFS = FileContext.getLocalFSFileContext();
    URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(scriptFile.getAbsolutePath())));
    LocalResource rsrc_alpha = RecordFactoryProvider.getRecordFactory(null).newRecordInstance(LocalResource.class);
    rsrc_alpha.setResource(resource_alpha);
    rsrc_alpha.setSize(-1);
    rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
    rsrc_alpha.setType(LocalResourceType.FILE);
    rsrc_alpha.setTimestamp(scriptFile.lastModified());
    String destinationFile = "dest_file";
    Map<String, LocalResource> localResources = new HashMap<>();
    localResources.put(destinationFile, rsrc_alpha);
    List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
    ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, commands, serviceData, containerTokens, acls);
    StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
    assertTrue(startResponse.getFailedRequests().isEmpty());
    assertEquals(1, context.getApplications().size());
    Application app = context.getApplications().get(appId);
    assertNotNull(app);
    // make sure the container reaches RUNNING state
    waitForNMContainerState(cm, cid, org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.RUNNING);
    Resource targetResource = Resource.newInstance(2048, 2);
    IncreaseContainersResourceResponse increaseResponse = increaseContainersResource(context, cm, cid, targetResource);
    assertTrue(increaseResponse.getFailedRequests().isEmpty());
    // check status
    ContainerStatus containerStatus = getContainerStatus(context, cm, cid);
    assertEquals(targetResource, containerStatus.getCapability());
    // restart and verify container is running and recovered
    // to the correct size
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertEquals(1, context.getApplications().size());
    app = context.getApplications().get(appId);
    assertNotNull(app);
    containerStatus = getContainerStatus(context, cm, cid);
    assertEquals(targetResource, containerStatus.getCapability());
}
Also used : HashMap(java.util.HashMap) URL(org.apache.hadoop.yarn.api.records.URL) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) IncreaseContainersResourceResponse(org.apache.hadoop.yarn.api.protocolrecords.IncreaseContainersResourceResponse) NMMemoryStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService) PrintWriter(java.io.PrintWriter) FileContext(org.apache.hadoop.fs.FileContext) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) LogAggregationContext(org.apache.hadoop.yarn.api.records.LogAggregationContext) Context(org.apache.hadoop.yarn.server.nodemanager.Context) Path(org.apache.hadoop.fs.Path) StartContainersResponse(org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse) Resource(org.apache.hadoop.yarn.api.records.Resource) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ByteBuffer(java.nio.ByteBuffer) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) File(java.io.File) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application) Credentials(org.apache.hadoop.security.Credentials) FileContext(org.apache.hadoop.fs.FileContext) Test(org.junit.Test)

Example 8 with NMStateStoreService

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.

the class TestContainerManagerRecovery method testNMRecoveryForAppFinishedWithLogAggregationFailure.

@Test
public void testNMRecoveryForAppFinishedWithLogAggregationFailure() throws Exception {
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
    NMStateStoreService stateStore = new NMMemoryStateStoreService();
    stateStore.init(conf);
    stateStore.start();
    Context context = createContext(conf, stateStore);
    ContainerManagerImpl cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    // add an application by starting a container
    ApplicationId appId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
    Map<String, LocalResource> localResources = Collections.emptyMap();
    Map<String, String> containerEnv = Collections.emptyMap();
    List<String> containerCmds = Collections.emptyList();
    Map<String, ByteBuffer> serviceData = Collections.emptyMap();
    ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, containerCmds, serviceData, null, null);
    StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
    assertTrue(startResponse.getFailedRequests().isEmpty());
    assertEquals(1, context.getApplications().size());
    Application app = context.getApplications().get(appId);
    assertNotNull(app);
    waitForAppState(app, ApplicationState.INITING);
    // simulate application completion
    List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
    finishedApps.add(appId);
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
    assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
    // application is still in NM context.
    assertEquals(1, context.getApplications().size());
    // restart and verify app is still there and marked as finished.
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertEquals(1, context.getApplications().size());
    app = context.getApplications().get(appId);
    assertNotNull(app);
    // no longer saving FINISH_APP event in NM stateStore,
    // simulate by resending FINISH_APP event
    app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
    waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
    // TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
    // is needed.
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
    assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
    // simulate log aggregation failed.
    app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
    // restart and verify app is no longer present after recovery
    cm.stop();
    context = createContext(conf, stateStore);
    cm = createContainerManager(context);
    cm.init(conf);
    cm.start();
    assertTrue(context.getApplications().isEmpty());
    cm.stop();
}
Also used : FileContext(org.apache.hadoop.fs.FileContext) NMContext(org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) LogAggregationContext(org.apache.hadoop.yarn.api.records.LogAggregationContext) Context(org.apache.hadoop.yarn.server.nodemanager.Context) ApplicationFinishEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent) StartContainersResponse(org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse) ArrayList(java.util.ArrayList) ApplicationEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ByteBuffer(java.nio.ByteBuffer) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application) NMMemoryStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService) Test(org.junit.Test)

Example 9 with NMStateStoreService

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.

the class TestNonAggregatingLogHandler method testRecovery.

@Test
public void testRecovery() throws Exception {
    File[] localLogDirs = getLocalLogDirFiles(this.getClass().getName(), 2);
    String localLogDirsString = localLogDirs[0].getAbsolutePath() + "," + localLogDirs[1].getAbsolutePath();
    conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDirsString);
    conf.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, false);
    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, YarnConfiguration.DEFAULT_NM_LOG_RETAIN_SECONDS);
    dirsHandler.init(conf);
    appEventHandler.resetLogHandlingEvent();
    assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
    NMStateStoreService stateStore = new NMMemoryStateStoreService();
    stateStore.init(conf);
    stateStore.start();
    NonAggregatingLogHandlerWithMockExecutor logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
    logHandler.init(conf);
    logHandler.start();
    logHandler.handle(new LogHandlerAppStartedEvent(appId, user, null, null));
    logHandler.handle(new LogHandlerContainerFinishedEvent(container11, 0));
    logHandler.handle(new LogHandlerAppFinishedEvent(appId));
    // simulate a restart and verify deletion is rescheduled
    logHandler.close();
    logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
    logHandler.init(conf);
    logHandler.start();
    ArgumentCaptor<Runnable> schedArg = ArgumentCaptor.forClass(Runnable.class);
    verify(logHandler.mockSched).schedule(schedArg.capture(), anyLong(), eq(TimeUnit.MILLISECONDS));
    // execute the runnable and verify another restart has nothing scheduled
    schedArg.getValue().run();
    logHandler.close();
    logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
    logHandler.init(conf);
    logHandler.start();
    verify(logHandler.mockSched, never()).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    // wait events get drained.
    this.dispatcher.await();
    assertTrue(appEventHandler.receiveLogHandlingFinishEvent());
    appEventHandler.resetLogHandlingEvent();
    assertFalse(appEventHandler.receiveLogHandlingFailedEvent());
    // send an app finish event against a removed app
    logHandler.handle(new LogHandlerAppFinishedEvent(appId));
    this.dispatcher.await();
    // verify to receive a log failed event.
    assertTrue(appEventHandler.receiveLogHandlingFailedEvent());
    assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
    logHandler.close();
}
Also used : LogHandlerAppStartedEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppStartedEvent) LogHandlerAppFinishedEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppFinishedEvent) LogHandlerContainerFinishedEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent) TimeUnit(java.util.concurrent.TimeUnit) File(java.io.File) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) NMMemoryStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService) Test(org.junit.Test)

Example 10 with NMStateStoreService

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.

the class TestResourceLocalizationService method testDirectoryCleanupOnNewlyCreatedStateStore.

@Test
public void testDirectoryCleanupOnNewlyCreatedStateStore() throws IOException, URISyntaxException {
    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
    AsyncDispatcher dispatcher = new AsyncDispatcher();
    dispatcher.init(new Configuration());
    ContainerExecutor exec = mock(ContainerExecutor.class);
    DeletionService delService = spy(new DeletionService(exec));
    delService.init(conf);
    delService.start();
    List<Path> localDirs = new ArrayList<Path>();
    String[] sDirs = new String[4];
    for (int i = 0; i < 4; ++i) {
        localDirs.add(lfs.makeQualified(new Path(basedir, i + "")));
        sDirs[i] = localDirs.get(i).toString();
    }
    conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, sDirs);
    LocalDirsHandlerService diskhandler = new LocalDirsHandlerService();
    diskhandler.init(conf);
    NMStateStoreService nmStateStoreService = mock(NMStateStoreService.class);
    when(nmStateStoreService.canRecover()).thenReturn(true);
    when(nmStateStoreService.isNewlyCreated()).thenReturn(true);
    ResourceLocalizationService locService = spy(new ResourceLocalizationService(dispatcher, exec, delService, diskhandler, nmContext));
    doReturn(lfs).when(locService).getLocalFileContext(isA(Configuration.class));
    try {
        dispatcher.start();
        // initialize ResourceLocalizationService
        locService.init(conf);
        final FsPermission defaultPerm = new FsPermission((short) 0755);
        // verify directory creation
        for (Path p : localDirs) {
            p = new Path((new URI(p.toString())).getPath());
            Path usercache = new Path(p, ContainerLocalizer.USERCACHE);
            verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
            verify(spylfs).mkdir(eq(usercache), eq(defaultPerm), eq(true));
            Path publicCache = new Path(p, ContainerLocalizer.FILECACHE);
            verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
            verify(spylfs).mkdir(eq(publicCache), eq(defaultPerm), eq(true));
            Path nmPriv = new Path(p, ResourceLocalizationService.NM_PRIVATE_DIR);
            verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
            verify(spylfs).mkdir(eq(nmPriv), eq(ResourceLocalizationService.NM_PRIVATE_PERM), eq(true));
        }
    } finally {
        dispatcher.stop();
        delService.stop();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ContainerExecutor(org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor) DefaultContainerExecutor(org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) DeletionService(org.apache.hadoop.yarn.server.nodemanager.DeletionService) ArrayList(java.util.ArrayList) LocalDirsHandlerService(org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService) URI(java.net.URI) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService) AsyncDispatcher(org.apache.hadoop.yarn.event.AsyncDispatcher) FsPermission(org.apache.hadoop.fs.permission.FsPermission) Test(org.junit.Test)

Aggregations

NMStateStoreService (org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService)14 Test (org.junit.Test)13 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)10 Configuration (org.apache.hadoop.conf.Configuration)9 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)9 Path (org.apache.hadoop.fs.Path)8 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)7 DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher)6 FileContext (org.apache.hadoop.fs.FileContext)5 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)5 NMContext (org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext)5 Application (org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application)5 LocalizerEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizerEvent)5 ArrayList (java.util.ArrayList)4 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)4 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)4 DeletionService (org.apache.hadoop.yarn.server.nodemanager.DeletionService)4 ByteBuffer (java.nio.ByteBuffer)3 HashMap (java.util.HashMap)3 StartContainersResponse (org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse)3