use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.
the class TestNodeStatusUpdater method testCompletedContainerStatusBackup.
/**
* Test completed containerStatus get back up when heart beat lost, and will
* be sent via next heart beat.
*/
@Test(timeout = 200000)
public void testCompletedContainerStatusBackup() throws Exception {
nm = new NodeManager() {
@Override
protected NodeStatusUpdater createNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
MyNodeStatusUpdater2 myNodeStatusUpdater = new MyNodeStatusUpdater2(context, dispatcher, healthChecker, metrics);
return myNodeStatusUpdater;
}
@Override
protected NMContext createNMContext(NMContainerTokenSecretManager containerTokenSecretManager, NMTokenSecretManagerInNM nmTokenSecretManager, NMStateStoreService store, boolean isDistributedSchedulingEnabled, Configuration config) {
return new MyNMContext(containerTokenSecretManager, nmTokenSecretManager, config);
}
};
YarnConfiguration conf = createNMConfig();
nm.init(conf);
nm.start();
int waitCount = 0;
while (heartBeatID <= 4 && waitCount++ != 20) {
Thread.sleep(500);
}
if (heartBeatID <= 4) {
Assert.fail("Failed to get all heartbeats in time, " + "heartbeatID:" + heartBeatID);
}
if (assertionFailedInThread.get()) {
Assert.fail("ContainerStatus Backup failed");
}
Assert.assertNotNull(nm.getNMContext().getSystemCredentialsForApps().get(ApplicationId.newInstance(1234, 1)).getToken(new Text("token1")));
nm.stop();
}
use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.
the class TestContainerManagerRecovery method testContainerResizeRecovery.
@Test
public void testContainerResizeRecovery() throws Exception {
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
Context context = createContext(conf, stateStore);
ContainerManagerImpl cm = createContainerManager(context, delSrvc);
cm.init(conf);
cm.start();
// add an application by starting a container
ApplicationId appId = ApplicationId.newInstance(0, 1);
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
Map<String, String> containerEnv = Collections.emptyMap();
Map<String, ByteBuffer> serviceData = Collections.emptyMap();
Credentials containerCreds = new Credentials();
DataOutputBuffer dob = new DataOutputBuffer();
containerCreds.writeTokenStorageToStream(dob);
ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
Map<ApplicationAccessType, String> acls = Collections.emptyMap();
File tmpDir = new File("target", this.getClass().getSimpleName() + "-tmpDir");
File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile");
PrintWriter fileWriter = new PrintWriter(scriptFile);
if (Shell.WINDOWS) {
fileWriter.println("@ping -n 100 127.0.0.1 >nul");
} else {
fileWriter.write("\numask 0");
fileWriter.write("\nexec sleep 100");
}
fileWriter.close();
FileContext localFS = FileContext.getLocalFSFileContext();
URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(scriptFile.getAbsolutePath())));
LocalResource rsrc_alpha = RecordFactoryProvider.getRecordFactory(null).newRecordInstance(LocalResource.class);
rsrc_alpha.setResource(resource_alpha);
rsrc_alpha.setSize(-1);
rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
rsrc_alpha.setType(LocalResourceType.FILE);
rsrc_alpha.setTimestamp(scriptFile.lastModified());
String destinationFile = "dest_file";
Map<String, LocalResource> localResources = new HashMap<>();
localResources.put(destinationFile, rsrc_alpha);
List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, commands, serviceData, containerTokens, acls);
StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
assertTrue(startResponse.getFailedRequests().isEmpty());
assertEquals(1, context.getApplications().size());
Application app = context.getApplications().get(appId);
assertNotNull(app);
// make sure the container reaches RUNNING state
waitForNMContainerState(cm, cid, org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.RUNNING);
Resource targetResource = Resource.newInstance(2048, 2);
IncreaseContainersResourceResponse increaseResponse = increaseContainersResource(context, cm, cid, targetResource);
assertTrue(increaseResponse.getFailedRequests().isEmpty());
// check status
ContainerStatus containerStatus = getContainerStatus(context, cm, cid);
assertEquals(targetResource, containerStatus.getCapability());
// restart and verify container is running and recovered
// to the correct size
cm.stop();
context = createContext(conf, stateStore);
cm = createContainerManager(context);
cm.init(conf);
cm.start();
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
containerStatus = getContainerStatus(context, cm, cid);
assertEquals(targetResource, containerStatus.getCapability());
}
use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.
the class TestContainerManagerRecovery method testNMRecoveryForAppFinishedWithLogAggregationFailure.
@Test
public void testNMRecoveryForAppFinishedWithLogAggregationFailure() throws Exception {
conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
Context context = createContext(conf, stateStore);
ContainerManagerImpl cm = createContainerManager(context);
cm.init(conf);
cm.start();
// add an application by starting a container
ApplicationId appId = ApplicationId.newInstance(0, 1);
ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
ContainerId cid = ContainerId.newContainerId(attemptId, 1);
Map<String, LocalResource> localResources = Collections.emptyMap();
Map<String, String> containerEnv = Collections.emptyMap();
List<String> containerCmds = Collections.emptyList();
Map<String, ByteBuffer> serviceData = Collections.emptyMap();
ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, containerCmds, serviceData, null, null);
StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
assertTrue(startResponse.getFailedRequests().isEmpty());
assertEquals(1, context.getApplications().size());
Application app = context.getApplications().get(appId);
assertNotNull(app);
waitForAppState(app, ApplicationState.INITING);
// simulate application completion
List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
finishedApps.add(appId);
app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
// application is still in NM context.
assertEquals(1, context.getApplications().size());
// restart and verify app is still there and marked as finished.
cm.stop();
context = createContext(conf, stateStore);
cm = createContainerManager(context);
cm.init(conf);
cm.start();
assertEquals(1, context.getApplications().size());
app = context.getApplications().get(appId);
assertNotNull(app);
// no longer saving FINISH_APP event in NM stateStore,
// simulate by resending FINISH_APP event
app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
// TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
// is needed.
app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
assertEquals(app.getApplicationState(), ApplicationState.FINISHED);
// simulate log aggregation failed.
app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
// restart and verify app is no longer present after recovery
cm.stop();
context = createContext(conf, stateStore);
cm = createContainerManager(context);
cm.init(conf);
cm.start();
assertTrue(context.getApplications().isEmpty());
cm.stop();
}
use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.
the class TestNonAggregatingLogHandler method testRecovery.
@Test
public void testRecovery() throws Exception {
File[] localLogDirs = getLocalLogDirFiles(this.getClass().getName(), 2);
String localLogDirsString = localLogDirs[0].getAbsolutePath() + "," + localLogDirs[1].getAbsolutePath();
conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDirsString);
conf.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, false);
conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, YarnConfiguration.DEFAULT_NM_LOG_RETAIN_SECONDS);
dirsHandler.init(conf);
appEventHandler.resetLogHandlingEvent();
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
NMStateStoreService stateStore = new NMMemoryStateStoreService();
stateStore.init(conf);
stateStore.start();
NonAggregatingLogHandlerWithMockExecutor logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
logHandler.init(conf);
logHandler.start();
logHandler.handle(new LogHandlerAppStartedEvent(appId, user, null, null));
logHandler.handle(new LogHandlerContainerFinishedEvent(container11, 0));
logHandler.handle(new LogHandlerAppFinishedEvent(appId));
// simulate a restart and verify deletion is rescheduled
logHandler.close();
logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
logHandler.init(conf);
logHandler.start();
ArgumentCaptor<Runnable> schedArg = ArgumentCaptor.forClass(Runnable.class);
verify(logHandler.mockSched).schedule(schedArg.capture(), anyLong(), eq(TimeUnit.MILLISECONDS));
// execute the runnable and verify another restart has nothing scheduled
schedArg.getValue().run();
logHandler.close();
logHandler = new NonAggregatingLogHandlerWithMockExecutor(dispatcher, mockDelService, dirsHandler, stateStore);
logHandler.init(conf);
logHandler.start();
verify(logHandler.mockSched, never()).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
// wait events get drained.
this.dispatcher.await();
assertTrue(appEventHandler.receiveLogHandlingFinishEvent());
appEventHandler.resetLogHandlingEvent();
assertFalse(appEventHandler.receiveLogHandlingFailedEvent());
// send an app finish event against a removed app
logHandler.handle(new LogHandlerAppFinishedEvent(appId));
this.dispatcher.await();
// verify to receive a log failed event.
assertTrue(appEventHandler.receiveLogHandlingFailedEvent());
assertFalse(appEventHandler.receiveLogHandlingFinishEvent());
logHandler.close();
}
use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService in project hadoop by apache.
the class TestResourceLocalizationService method testDirectoryCleanupOnNewlyCreatedStateStore.
@Test
public void testDirectoryCleanupOnNewlyCreatedStateStore() throws IOException, URISyntaxException {
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
AsyncDispatcher dispatcher = new AsyncDispatcher();
dispatcher.init(new Configuration());
ContainerExecutor exec = mock(ContainerExecutor.class);
DeletionService delService = spy(new DeletionService(exec));
delService.init(conf);
delService.start();
List<Path> localDirs = new ArrayList<Path>();
String[] sDirs = new String[4];
for (int i = 0; i < 4; ++i) {
localDirs.add(lfs.makeQualified(new Path(basedir, i + "")));
sDirs[i] = localDirs.get(i).toString();
}
conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS, sDirs);
LocalDirsHandlerService diskhandler = new LocalDirsHandlerService();
diskhandler.init(conf);
NMStateStoreService nmStateStoreService = mock(NMStateStoreService.class);
when(nmStateStoreService.canRecover()).thenReturn(true);
when(nmStateStoreService.isNewlyCreated()).thenReturn(true);
ResourceLocalizationService locService = spy(new ResourceLocalizationService(dispatcher, exec, delService, diskhandler, nmContext));
doReturn(lfs).when(locService).getLocalFileContext(isA(Configuration.class));
try {
dispatcher.start();
// initialize ResourceLocalizationService
locService.init(conf);
final FsPermission defaultPerm = new FsPermission((short) 0755);
// verify directory creation
for (Path p : localDirs) {
p = new Path((new URI(p.toString())).getPath());
Path usercache = new Path(p, ContainerLocalizer.USERCACHE);
verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
verify(spylfs).mkdir(eq(usercache), eq(defaultPerm), eq(true));
Path publicCache = new Path(p, ContainerLocalizer.FILECACHE);
verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
verify(spylfs).mkdir(eq(publicCache), eq(defaultPerm), eq(true));
Path nmPriv = new Path(p, ResourceLocalizationService.NM_PRIVATE_DIR);
verify(spylfs).rename(eq(usercache), any(Path.class), any(Options.Rename.class));
verify(spylfs).mkdir(eq(nmPriv), eq(ResourceLocalizationService.NM_PRIVATE_PERM), eq(true));
}
} finally {
dispatcher.stop();
delService.stop();
}
}
Aggregations