Search in sources :

Example 1 with RecoveredContainerState

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState in project hadoop by apache.

the class TestNMLeveldbStateStoreService method testContainerStorage.

@Test
public void testContainerStorage() throws IOException {
    // test empty when no state
    List<RecoveredContainerState> recoveredContainers = stateStore.loadContainersState();
    assertTrue(recoveredContainers.isEmpty());
    // create a container request
    ApplicationId appId = ApplicationId.newInstance(1234, 3);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 4);
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
    StartContainerRequest containerReq = createContainerRequest(containerId);
    // store a container and verify recovered
    stateStore.storeContainer(containerId, 0, containerReq);
    // verify the container version key is not stored for new containers
    DB db = stateStore.getDB();
    assertNull("version key present for new container", db.get(bytes(stateStore.getContainerVersionKey(containerId.toString()))));
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    RecoveredContainerState rcs = recoveredContainers.get(0);
    assertEquals(0, rcs.getVersion());
    assertEquals(RecoveredContainerStatus.REQUESTED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertEquals(false, rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertTrue(rcs.getDiagnostics().isEmpty());
    // store a new container record without StartContainerRequest
    ContainerId containerId1 = ContainerId.newContainerId(appAttemptId, 6);
    stateStore.storeContainerLaunched(containerId1);
    recoveredContainers = stateStore.loadContainersState();
    // check whether the new container record is discarded
    assertEquals(1, recoveredContainers.size());
    // queue the container, and verify recovered
    stateStore.storeContainerQueued(containerId);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(RecoveredContainerStatus.QUEUED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertEquals(false, rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertTrue(rcs.getDiagnostics().isEmpty());
    // launch the container, add some diagnostics, and verify recovered
    StringBuilder diags = new StringBuilder();
    stateStore.storeContainerLaunched(containerId);
    diags.append("some diags for container");
    stateStore.storeContainerDiagnostics(containerId, diags);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(RecoveredContainerStatus.LAUNCHED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertEquals(false, rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertEquals(diags.toString(), rcs.getDiagnostics());
    // increase the container size, and verify recovered
    stateStore.storeContainerResourceChanged(containerId, 2, Resource.newInstance(2468, 4));
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(2, rcs.getVersion());
    assertEquals(RecoveredContainerStatus.LAUNCHED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertEquals(false, rcs.getKilled());
    assertEquals(Resource.newInstance(2468, 4), rcs.getCapability());
    // mark the container killed, add some more diags, and verify recovered
    diags.append("some more diags for container");
    stateStore.storeContainerDiagnostics(containerId, diags);
    stateStore.storeContainerKilled(containerId);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(RecoveredContainerStatus.LAUNCHED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertTrue(rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertEquals(diags.toString(), rcs.getDiagnostics());
    // add yet more diags, mark container completed, and verify recovered
    diags.append("some final diags");
    stateStore.storeContainerDiagnostics(containerId, diags);
    stateStore.storeContainerCompleted(containerId, 21);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(RecoveredContainerStatus.COMPLETED, rcs.getStatus());
    assertEquals(21, rcs.getExitCode());
    assertTrue(rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertEquals(diags.toString(), rcs.getDiagnostics());
    // store remainingRetryAttempts, workDir and logDir
    stateStore.storeContainerRemainingRetryAttempts(containerId, 6);
    stateStore.storeContainerWorkDir(containerId, "/test/workdir");
    stateStore.storeContainerLogDir(containerId, "/test/logdir");
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    rcs = recoveredContainers.get(0);
    assertEquals(6, rcs.getRemainingRetryAttempts());
    assertEquals("/test/workdir", rcs.getWorkDir());
    assertEquals("/test/logdir", rcs.getLogDir());
    // remove the container and verify not recovered
    stateStore.removeContainer(containerId);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertTrue(recoveredContainers.isEmpty());
}
Also used : ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) RecoveredContainerState(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DB(org.iq80.leveldb.DB) StartContainerRequest(org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest) Test(org.junit.Test)

Example 2 with RecoveredContainerState

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState in project hadoop by apache.

the class ContainerManagerImpl method recover.

@SuppressWarnings("unchecked")
private void recover() throws IOException, URISyntaxException {
    NMStateStoreService stateStore = context.getNMStateStore();
    if (stateStore.canRecover()) {
        rsrcLocalizationSrvc.recoverLocalizedResources(stateStore.loadLocalizationState());
        RecoveredApplicationsState appsState = stateStore.loadApplicationsState();
        for (ContainerManagerApplicationProto proto : appsState.getApplications()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Recovering application with state: " + proto.toString());
            }
            recoverApplication(proto);
        }
        for (RecoveredContainerState rcs : stateStore.loadContainersState()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Recovering container with state: " + rcs);
            }
            recoverContainer(rcs);
        }
    } else {
        LOG.info("Not a recoverable state store. Nothing to recover.");
    }
}
Also used : RecoveredContainerState(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState) RecoveredApplicationsState(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState) ContainerManagerApplicationProto(org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto) NMStateStoreService(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService)

Example 3 with RecoveredContainerState

use of org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState in project hadoop by apache.

the class TestNMLeveldbStateStoreService method testUnexpectedKeyDoesntThrowException.

@Test
public void testUnexpectedKeyDoesntThrowException() throws IOException {
    // test empty when no state
    List<RecoveredContainerState> recoveredContainers = stateStore.loadContainersState();
    assertTrue(recoveredContainers.isEmpty());
    // create a container request
    ApplicationId appId = ApplicationId.newInstance(1234, 3);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 4);
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5);
    LocalResource lrsrc = LocalResource.newInstance(URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L, 1234567890L);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put("rsrc", lrsrc);
    Map<String, String> env = new HashMap<String, String>();
    env.put("somevar", "someval");
    List<String> containerCmds = new ArrayList<String>();
    containerCmds.add("somecmd");
    containerCmds.add("somearg");
    Map<String, ByteBuffer> serviceData = new HashMap<String, ByteBuffer>();
    serviceData.put("someservice", ByteBuffer.wrap(new byte[] { 0x1, 0x2, 0x3 }));
    ByteBuffer containerTokens = ByteBuffer.wrap(new byte[] { 0x7, 0x8, 0x9, 0xa });
    Map<ApplicationAccessType, String> acls = new HashMap<ApplicationAccessType, String>();
    acls.put(ApplicationAccessType.VIEW_APP, "viewuser");
    acls.put(ApplicationAccessType.MODIFY_APP, "moduser");
    ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, env, containerCmds, serviceData, containerTokens, acls);
    Resource containerRsrc = Resource.newInstance(1357, 3);
    ContainerTokenIdentifier containerTokenId = new ContainerTokenIdentifier(containerId, "host", "user", containerRsrc, 9876543210L, 42, 2468, Priority.newInstance(7), 13579);
    Token containerToken = Token.newInstance(containerTokenId.getBytes(), ContainerTokenIdentifier.KIND.toString(), "password".getBytes(), "tokenservice");
    StartContainerRequest containerReq = StartContainerRequest.newInstance(clc, containerToken);
    stateStore.storeContainer(containerId, 0, containerReq);
    // add a invalid key
    byte[] invalidKey = ("ContainerManager/containers/" + containerId.toString() + "/invalidKey1234").getBytes();
    stateStore.getDB().put(invalidKey, new byte[1]);
    restartStateStore();
    recoveredContainers = stateStore.loadContainersState();
    assertEquals(1, recoveredContainers.size());
    RecoveredContainerState rcs = recoveredContainers.get(0);
    assertEquals(RecoveredContainerStatus.REQUESTED, rcs.getStatus());
    assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode());
    assertEquals(false, rcs.getKilled());
    assertEquals(containerReq, rcs.getStartRequest());
    assertTrue(rcs.getDiagnostics().isEmpty());
    assertEquals(RecoveredContainerType.KILL, rcs.getRecoveryType());
    // assert unknown keys are cleaned up finally
    assertNotNull(stateStore.getDB().get(invalidKey));
    stateStore.removeContainer(containerId);
    assertNull(stateStore.getDB().get(invalidKey));
}
Also used : HashMap(java.util.HashMap) RecoveredContainerState(org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState) ArrayList(java.util.ArrayList) Resource(org.apache.hadoop.yarn.api.records.Resource) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) Token(org.apache.hadoop.yarn.api.records.Token) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ByteBuffer(java.nio.ByteBuffer) LocalResource(org.apache.hadoop.yarn.api.records.LocalResource) ContainerTokenIdentifier(org.apache.hadoop.yarn.security.ContainerTokenIdentifier) StartContainerRequest(org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Test(org.junit.Test)

Aggregations

RecoveredContainerState (org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState)3 StartContainerRequest (org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest)2 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)2 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)2 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)2 Test (org.junit.Test)2 ByteBuffer (java.nio.ByteBuffer)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)1 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)1 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)1 Resource (org.apache.hadoop.yarn.api.records.Resource)1 Token (org.apache.hadoop.yarn.api.records.Token)1 ContainerManagerApplicationProto (org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto)1 ContainerTokenIdentifier (org.apache.hadoop.yarn.security.ContainerTokenIdentifier)1 NMStateStoreService (org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService)1 RecoveredApplicationsState (org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState)1 DB (org.iq80.leveldb.DB)1