Search in sources :

Example 1 with ContainerPlacementMetadata

use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.

the class ContainerManager method registerContainerPlacementAction.

/**
 * Registers a container placement action to move the running container to a destination host. If the destination host
 * is the same as the host on which the container is running, the placement action is treated as a restart.
 *
 * When host affinity is disabled a move / restart is only allowed on ANY_HOST
 * When host affinity is enabled a move / restart is allowed on a specific host or ANY_HOST
 *
 * Container placement requests are tied to a deploymentId, which is currently
 * {@link org.apache.samza.config.ApplicationConfig#APP_RUN_ID}. On job restarts, container placement requests queued
 * for the previous deployment are deleted using this id.
 *
 * All kinds of container placement requests, except those whose destination host is "FORCE_RESTART_LAST_SEEN", work
 * with a RESERVE - STOP - START policy: resources are accrued first before issuing a container stop, and failure to
 * accrue them leaves the running container untouched. Requests with destination host "FORCE_RESTART_LAST_SEEN" work
 * with a STOP - RESERVE - START policy: the running container is stopped first and only then are resource requests
 * issued, which is equivalent to doing a kill -9 on a container.
 *
 * @param requestMessage request containing logical processor id 0,1,2 and host where container is desired to be moved,
 *                       acceptable values of the destination host are
 *                       - valid hostname
 *                       - "ANY_HOST" in this case the request is sent to resource manager for any host
 *                       - "LAST_SEEN" in this case request is sent to resource manager for last seen host
 *                       - "FORCE_RESTART_LAST_SEEN" in this case request is sent to resource manager for last seen host
 * @param containerAllocator to request physical resources
 */
public void registerContainerPlacementAction(ContainerPlacementRequestMessage requestMessage, ContainerAllocator containerAllocator) {
    String processorId = requestMessage.getProcessorId();
    String destinationHost = requestMessage.getDestinationHost();
    // Is the action ready to be de-queued and taken or does it need to wait to be executed in future
    if (!deQueueAction(requestMessage)) {
        return;
    }
    LOG.info("ContainerPlacement action is de-queued metadata: {}", requestMessage);
    Pair<ContainerPlacementMessage.StatusCode, String> actionStatus = validatePlacementAction(requestMessage);
    // Action is de-queued so we record it in the cache
    placementRequestsCache.put(requestMessage.getUuid());
    // Remove the request message from metastore since this message is already acted upon
    containerPlacementMetadataStore.deleteContainerPlacementRequestMessage(requestMessage.getUuid());
    // Request is bad: just write the response for the message & return
    if (actionStatus.getKey() == ContainerPlacementMessage.StatusCode.BAD_REQUEST) {
        LOG.info("Status updated for ContainerPlacement action request: {} response: {}", requestMessage, actionStatus.getValue());
        writeContainerPlacementResponseMessage(requestMessage, actionStatus.getKey(), actionStatus.getValue());
        return;
    }
    /*
     * When destination host is FORCE_RESTART_LAST_SEEN it is treated as equivalent to a kill -9 operation for the
     * container. In this scenario the container is stopped first and we fall back to the normal restart path, so the
     * policy here is stop - reserve - move
     */
    if (destinationHost.equals(FORCE_RESTART_LAST_SEEN)) {
        LOG.info("Issuing a force restart for Processor ID: {} for ContainerPlacement action request {}", processorId, requestMessage);
        clusterResourceManager.stopStreamProcessor(samzaApplicationState.runningProcessors.get(processorId));
        writeContainerPlacementResponseMessage(requestMessage, ContainerPlacementMessage.StatusCode.SUCCEEDED, "Successfully issued a stop container request falling back to normal restart path");
        return;
    }
    /*
     * When destination host is LAST_SEEN it is treated as a restart request on the host where the container is running
     * or has been seen last, but the policy here is reserve - stop - move: resources are reserved first, and only if
     * they are accrued is the active container stopped and started again on the acquired resource
     */
    if (destinationHost.equals(LAST_SEEN)) {
        String lastSeenHost = getSourceHostForContainer(requestMessage);
        LOG.info("Changing the requested host for placement action to {} because requested host is LAST_SEEN", lastSeenHost);
        destinationHost = lastSeenHost;
    }
    // TODO: SAMZA-2457: Allow host affinity disabled jobs to move containers to specific host
    if (!hostAffinityEnabled) {
        // Log the same constant that is assigned below so the message always matches the effective destination
        LOG.info("Changing the requested host for placement action to {} because host affinity is disabled", ANY_HOST);
        destinationHost = ANY_HOST;
    }
    // Register metadata; later updates mutate this same instance, so a single registration is sufficient
    ContainerPlacementMetadata actionMetaData = new ContainerPlacementMetadata(requestMessage, getSourceHostForContainer(requestMessage));
    actions.put(processorId, actionMetaData);
    // If the job is running in a degraded state then the container is already stopped
    if (samzaApplicationState.failedProcessors.containsKey(processorId)) {
        actionMetaData.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.STOPPED);
    }
    SamzaResourceRequest resourceRequest = containerAllocator.getResourceRequest(processorId, destinationHost);
    // Record the resource request for monitoring
    actionMetaData.recordResourceRequest(resourceRequest);
    updateContainerPlacementActionStatus(actionMetaData, ContainerPlacementMessage.StatusCode.IN_PROGRESS, "Preferred Resources requested");
    containerAllocator.issueResourceRequest(resourceRequest);
}
Also used : ContainerPlacementMetadata(org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata)

Example 2 with ContainerPlacementMetadata

use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.

the class ContainerManager method handleContainerLaunchSuccess.

/**
 * Updates state after a container has launched successfully. When the launch belongs to an active container placement
 * action, the action's metadata is marked as running and a success status is reported for it; otherwise nothing is done.
 *
 * @param processorId logical processor id of container 0,1,2
 * @param containerHost host the container was started on, included in the reported success message
 */
void handleContainerLaunchSuccess(String processorId, String containerHost) {
    // Launches that are not driven by a placement action need no bookkeeping here
    if (!hasActiveContainerPlacementAction(processorId)) {
        return;
    }
    ContainerPlacementMetadata placementMetadata = getPlacementActionMetadata(processorId).get();
    // Flag the active container as running again before dispatching the response
    placementMetadata.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.RUNNING);
    String successMessage = "Successfully completed the container placement action started container on host " + containerHost;
    updateContainerPlacementActionStatus(placementMetadata, ContainerPlacementMessage.StatusCode.SUCCEEDED, successMessage);
}
Also used : ContainerPlacementMetadata(org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata)

Example 3 with ContainerPlacementMetadata

use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.

the class ContainerManager method handleContainerLaunch.

/**
 * Handles the container start action for both active & standby containers. This method is invoked by the allocator thread
 *
 * Case 1. If the container launch request is due to an existing container placement action, issue a stop on active
 *         container & wait for the active container to be stopped before issuing a start.
 * Case 2. If StandbyContainer is present refer to {@code StandbyContainerManager#checkStandbyConstraintsAndRunStreamProcessor}
 * Case 3. Otherwise just invoke a container start on the allocated resource for the pending request
 *
 * TODO: SAMZA-2399: Investigate & configure a timeout for container stop if needed
 *
 * @param request pending request for the preferred host
 * @param preferredHost preferred host to start the container
 * @param allocatedResource resource allocated from {@link ClusterResourceManager}
 * @param resourceRequestState state of request in {@link ContainerAllocator}
 * @param allocator to request resources from {@link ClusterResourceManager}
 *
 * @return true if the container launch is complete, false if the container launch is in progress. A container launch
 *         might be in progress when it is waiting for the previous container incarnation to stop in case of container
 *         placement actions
 */
boolean handleContainerLaunch(SamzaResourceRequest request, String preferredHost, SamzaResource allocatedResource, ResourceRequestState resourceRequestState, ContainerAllocator allocator) {
    if (hasActiveContainerPlacementAction(request.getProcessorId())) {
        String processorId = request.getProcessorId();
        ContainerPlacementMetadata actionMetaData = getPlacementActionMetadata(processorId).get();
        ContainerPlacementMetadata.ContainerStatus actionStatus = actionMetaData.getContainerStatus();
        if (samzaApplicationState.runningProcessors.containsKey(processorId) && actionStatus == ContainerPlacementMetadata.ContainerStatus.RUNNING) {
            // Resources are available: stop the active container first, the start is issued on a later invocation
            LOG.debug("Requesting running container to shutdown due to existing ContainerPlacement action {}", actionMetaData);
            actionMetaData.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.STOP_IN_PROGRESS);
            updateContainerPlacementActionStatus(actionMetaData, ContainerPlacementMessage.StatusCode.IN_PROGRESS, "Active container stop in progress");
            clusterResourceManager.stopStreamProcessor(samzaApplicationState.runningProcessors.get(processorId));
            return false;
        } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOP_IN_PROGRESS) {
            LOG.info("Waiting for running container to shutdown due to existing ContainerPlacement action {}", actionMetaData);
            return false;
        } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOP_FAILED) {
            // The active container could not be stopped: fail the action and give up the pending request
            LOG.info("Shutdown on running container failed for action {}", actionMetaData);
            markContainerPlacementActionFailed(actionMetaData, String.format("failed to stop container on current host %s", actionMetaData.getSourceHost()));
            resourceRequestState.cancelResourceRequest(request);
            return true;
        } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOPPED) {
            // Note: Always check constraints against allocated resource, since preferred host can be ANY_HOST as well
            if (standbyContainerManager.isPresent() && !standbyContainerManager.get().checkStandbyConstraints(processorId, allocatedResource.getHost())) {
                // Report the host the constraints were actually evaluated against, i.e. the allocated one
                LOG.info("Starting container {} on host {} does not meet standby constraints, falling back to source host placement metadata: {}", processorId, allocatedResource.getHost(), actionMetaData);
                // Release unstartable container
                standbyContainerManager.get().releaseUnstartableContainer(request, allocatedResource, preferredHost, resourceRequestState);
                // Fallback to source host since the new allocated resource does not meet standby constraints
                allocator.requestResource(processorId, actionMetaData.getSourceHost());
                markContainerPlacementActionFailed(actionMetaData, String.format("allocated resource %s does not meet standby constraints now, falling back to source host", allocatedResource));
            } else {
                LOG.info("Status updated for ContainerPlacement action: {}", actionMetaData);
                allocator.runStreamProcessor(request, preferredHost);
            }
            return true;
        }
    }
    // No placement action (or none of the handled states) applies: use the regular launch path
    if (this.standbyContainerManager.isPresent()) {
        standbyContainerManager.get().checkStandbyConstraintsAndRunStreamProcessor(request, preferredHost, allocatedResource, allocator, resourceRequestState);
    } else {
        allocator.runStreamProcessor(request, preferredHost);
    }
    return true;
}
Also used : ContainerPlacementMetadata(org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata)

Example 4 with ContainerPlacementMetadata

use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.

the class TestContainerPlacementActions method testContainerPlacementsForJobRunningInDegradedState.

/**
 * Drives a two container job into a degraded state (container "1" fails after exhausting its retries) and then
 * verifies that a container placement request for the failed container brings it back up on the requested host.
 */
@Test(timeout = 20000)
public void testContainerPlacementsForJobRunningInDegradedState() throws Exception {
    // Set failure after retries to false to enable job running in degraded state
    config = new MapConfig(configVals, getConfigWithHostAffinityAndRetries(true, 1, false));
    state = new SamzaApplicationState(JobModelManagerTestUtil.getJobModelManager(getConfig(), 2, this.server));
    callback = mock(ClusterResourceManager.Callback.class);
    MockClusterResourceManager clusterResourceManager = new MockClusterResourceManager(callback, state);
    FaultDomainManager faultDomainManager = mock(FaultDomainManager.class);
    ClusterManagerConfig clusterManagerConfig = new ClusterManagerConfig(config);
    containerManager = spy(new ContainerManager(containerPlacementMetadataStore, state, clusterResourceManager, true, false, localityManager, faultDomainManager, config));
    allocatorWithHostAffinity = new MockContainerAllocatorWithHostAffinity(clusterResourceManager, config, state, containerManager);
    cpm = new ContainerProcessManager(clusterManagerConfig, state, new MetricsRegistryMap(), clusterResourceManager, Optional.of(allocatorWithHostAffinity), containerManager, localityManager, false);
    // Forward resource availability callbacks to the process manager on a separate thread
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) {
            new Thread(() -> {
                Object[] args = invocation.getArguments();
                cpm.onResourcesAvailable((List<SamzaResource>) args[0]);
            }, "AMRMClientAsync").start();
            return null;
        }
    }).when(callback).onResourcesAvailable(anyList());
    // Forward stream processor launch success callbacks to the process manager on a separate thread
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) {
            new Thread(() -> {
                Object[] args = invocation.getArguments();
                cpm.onStreamProcessorLaunchSuccess((SamzaResource) args[0]);
            }, "AMRMClientAsync").start();
            return null;
        }
    }).when(callback).onStreamProcessorLaunchSuccess(any());
    // Forward resource completion callbacks to the process manager on a separate thread
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) {
            new Thread(() -> {
                Object[] args = invocation.getArguments();
                cpm.onResourcesCompleted((List<SamzaResourceStatus>) args[0]);
            }, "AMRMClientAsync").start();
            return null;
        }
    }).when(callback).onResourcesCompleted(anyList());
    cpm.start();
    if (!allocatorWithHostAffinity.awaitContainersStart(2, 5, TimeUnit.SECONDS)) {
        fail("timed out waiting for the containers to start");
    }
    while (state.runningProcessors.size() != 2) {
        Thread.sleep(100);
    }
    // App is in running state with two containers running
    assertEquals(state.runningProcessors.size(), 2);
    assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
    assertEquals(state.runningProcessors.get("1").getHost(), "host-2");
    assertEquals(state.preferredHostRequests.get(), 2);
    assertEquals(state.anyHostRequests.get(), 0);
    // Trigger a container failure
    clusterResourceManager.stopStreamProcessor(state.runningProcessors.get("1"), -103);
    // Wait for container to start
    if (!allocatorWithHostAffinity.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
        fail("timed out waiting for the containers to start");
    }
    while (state.runningProcessors.size() != 2) {
        Thread.sleep(100);
    }
    // Trigger a container failure again
    clusterResourceManager.stopStreamProcessor(state.runningProcessors.get("1"), -103);
    // Ensure that this container has exhausted all retries: keep polling until BOTH conditions hold, i.e. exactly one
    // processor is recorded as failed and exactly one is still running
    while (state.failedProcessors.size() != 1 || state.runningProcessors.size() != 1) {
        Thread.sleep(100);
    }
    // At this point the application should only have one container running
    assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
    assertEquals(state.runningProcessors.size(), 1);
    assertEquals(state.pendingProcessors.size(), 0);
    assertTrue(state.failedProcessors.containsKey("1"));
    ContainerPlacementRequestMessage requestMessage = new ContainerPlacementRequestMessage(UUID.randomUUID(), "app-attempt-001", "1", "host-3", System.currentTimeMillis());
    ContainerPlacementMetadata metadata = containerManager.registerContainerPlacementActionForTest(requestMessage, allocatorWithHostAffinity);
    // Wait for the ControlAction to complete
    if (!allocatorWithHostAffinity.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
        fail("timed out waiting for the containers to start");
    }
    // Wait for both the containers to be in running state AND the control action metadata to succeed
    while (state.runningProcessors.size() != 2 || metadata.getActionStatus() != ContainerPlacementMessage.StatusCode.SUCCEEDED) {
        Thread.sleep(100);
    }
    assertEquals(state.preferredHostRequests.get(), 4);
    assertEquals(state.runningProcessors.size(), 2);
    // Container 0 should stay on host-1 and container 1 should now be placed on host-3
    assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
    assertEquals(state.runningProcessors.get("1").getHost(), "host-3");
    assertEquals(state.anyHostRequests.get(), 0);
    // Failed processors must be empty
    assertEquals(state.failedProcessors.size(), 0);
}
Also used : ClusterManagerConfig(org.apache.samza.config.ClusterManagerConfig) InvocationOnMock(org.mockito.invocation.InvocationOnMock) MapConfig(org.apache.samza.config.MapConfig) ContainerPlacementMetadata(org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata) MetricsRegistryMap(org.apache.samza.metrics.MetricsRegistryMap) ContainerPlacementRequestMessage(org.apache.samza.container.placement.ContainerPlacementRequestMessage) Test(org.junit.Test)

Example 5 with ContainerPlacementMetadata

use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.

the class TestContainerPlacementActions method assertBadRequests.

/**
 * Registers a container placement request built from the given processor id and destination host, asserts that no
 * placement metadata is returned for it, and then polls the metadata store until a BAD_REQUEST response for the
 * request shows up. Finally verifies the response contents and that the request message is no longer in the store.
 *
 * @param processorId processor id to put on the request
 * @param destinationHost destination host to put on the request
 * @param containerManager container manager the request is registered with
 * @param allocator allocator handed to the container manager along with the request
 * @throws InterruptedException if the thread is interrupted while sleeping between polls
 */
private void assertBadRequests(String processorId, String destinationHost, ContainerManager containerManager, ContainerAllocator allocator) throws InterruptedException {
    ContainerPlacementRequestMessage badRequest = new ContainerPlacementRequestMessage(UUID.randomUUID(), "app-Attemp-001", processorId, destinationHost, System.currentTimeMillis());
    ContainerPlacementMetadata placementMetadata = containerManager.registerContainerPlacementActionForTest(badRequest, allocator);
    assertNull(placementMetadata);
    // Poll every 100 ms until the store holds a BAD_REQUEST response for this request
    Optional<ContainerPlacementResponseMessage> response = containerPlacementMetadataStore.readContainerPlacementResponseMessage(badRequest.getUuid());
    while (!response.isPresent() || response.get().getStatusCode() != ContainerPlacementMessage.StatusCode.BAD_REQUEST) {
        Thread.sleep(100);
        response = containerPlacementMetadataStore.readContainerPlacementResponseMessage(badRequest.getUuid());
    }
    ContainerPlacementResponseMessage badRequestResponse = response.get();
    assertEquals(badRequestResponse.getStatusCode(), ContainerPlacementMessage.StatusCode.BAD_REQUEST);
    assertResponseMessage(response.get(), badRequest);
    // Request shall be deleted as soon as it is acted upon
    boolean requestStillStored = containerPlacementMetadataStore.readContainerPlacementRequestMessage(badRequest.getUuid()).isPresent();
    assertFalse(requestStillStored);
}
Also used : ContainerPlacementMetadata(org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata) ContainerPlacementResponseMessage(org.apache.samza.container.placement.ContainerPlacementResponseMessage) ContainerPlacementRequestMessage(org.apache.samza.container.placement.ContainerPlacementRequestMessage)

Aggregations

ContainerPlacementMetadata (org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata)13 ContainerPlacementRequestMessage (org.apache.samza.container.placement.ContainerPlacementRequestMessage)6 ContainerPlacementResponseMessage (org.apache.samza.container.placement.ContainerPlacementResponseMessage)5 Test (org.junit.Test)5 InvocationOnMock (org.mockito.invocation.InvocationOnMock)5 ImmutableList (com.google.common.collect.ImmutableList)2 List (java.util.List)2 ClusterManagerConfig (org.apache.samza.config.ClusterManagerConfig)2 MapConfig (org.apache.samza.config.MapConfig)2 MetricsRegistryMap (org.apache.samza.metrics.MetricsRegistryMap)2 HashMap (java.util.HashMap)1