use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.
the class ContainerManager method registerContainerPlacementAction.
/**
 * Registers a container placement action that moves a running container to a destination host. If the destination
 * host is the host the container is already running on, the action is treated as a restart.
 *
 * When host affinity is disabled a move / restart is only allowed on ANY_HOST.
 * When host affinity is enabled a move / restart is allowed on a specific host or ANY_HOST.
 *
 * Container placement requests are tied to a deploymentId, which is currently
 * {@link org.apache.samza.config.ApplicationConfig#APP_RUN_ID}. On job restarts, container placement requests queued
 * for the previous deployment are deleted using this id.
 *
 * Every kind of request except the one with destination host "FORCE_RESTART_LAST_SEEN" follows a
 * RESERVE - STOP - START policy: resources are acquired first and only then is a container stop issued, so a failure
 * to acquire resources leaves the running container untouched. Requests with destination host
 * "FORCE_RESTART_LAST_SEEN" follow a STOP - RESERVE - START policy: the running container is stopped first and the
 * resource requests are issued afterwards, which is equivalent to doing a kill -9 on the container.
 *
 * @param requestMessage request containing the logical processor id (0,1,2) and the host the container should be
 *                       moved to; acceptable values for the host are
 *                       - a valid hostname
 *                       - "ANY_HOST", in which case the request is sent to the resource manager for any host
 *                       - "LAST_SEEN", in which case the request is sent to the resource manager for the last seen host
 *                       - "FORCE_RESTART_LAST_SEEN", in which case the request is sent to the resource manager for
 *                         the last seen host
 * @param containerAllocator to request physical resources
 */
public void registerContainerPlacementAction(ContainerPlacementRequestMessage requestMessage, ContainerAllocator containerAllocator) {
  String targetProcessorId = requestMessage.getProcessorId();
  String requestedHost = requestMessage.getDestinationHost();

  // The action may have to wait to be executed in the future; only proceed once it is ready to be de-queued
  boolean readyToExecute = deQueueAction(requestMessage);
  if (!readyToExecute) {
    return;
  }
  LOG.info("ContainerPlacement action is de-queued metadata: {}", requestMessage);

  Pair<ContainerPlacementMessage.StatusCode, String> validationResult = validatePlacementAction(requestMessage);

  // The action has been de-queued, so remember it in the cache ...
  placementRequestsCache.put(requestMessage.getUuid());
  // ... and drop the request from the metastore because it has now been acted upon
  containerPlacementMetadataStore.deleteContainerPlacementRequestMessage(requestMessage.getUuid());

  // A bad request only gets a response written back; nothing else happens
  boolean badRequest = validationResult.getKey() == ContainerPlacementMessage.StatusCode.BAD_REQUEST;
  if (badRequest) {
    LOG.info("Status updated for ContainerPlacement action request: {} response: {}", requestMessage, validationResult.getValue());
    writeContainerPlacementResponseMessage(requestMessage, validationResult.getKey(), validationResult.getValue());
    return;
  }

  /*
   * Destination host FORCE_RESTART_LAST_SEEN is treated as the equivalent of a kill -9 on the container. The
   * container is stopped first and the normal restart path takes over afterwards, so the policy here is
   * stop - reserve - move.
   */
  boolean forceRestartRequested = requestedHost.equals(FORCE_RESTART_LAST_SEEN);
  if (forceRestartRequested) {
    LOG.info("Issuing a force restart for Processor ID: {} for ContainerPlacement action request {}", targetProcessorId, requestMessage);
    // NOTE(review): the lookup below yields null if the processor is not in runningProcessors -- confirm that
    // validatePlacementAction rules this out for force-restart requests.
    clusterResourceManager.stopStreamProcessor(samzaApplicationState.runningProcessors.get(targetProcessorId));
    writeContainerPlacementResponseMessage(requestMessage, ContainerPlacementMessage.StatusCode.SUCCEEDED, "Successfully issued a stop container request falling back to normal restart path");
    return;
  }

  /*
   * Destination host LAST_SEEN is treated as a restart request on the host the container is running on or was last
   * seen on. The policy is reserve - stop - move: resources are reserved first, and only once they are acquired is
   * the active container stopped and started again on the acquired resource.
   */
  if (requestedHost.equals(LAST_SEEN)) {
    String lastSeenHost = getSourceHostForContainer(requestMessage);
    LOG.info("Changing the requested host for placement action to {} because requested host is LAST_SEEN", lastSeenHost);
    requestedHost = lastSeenHost;
  }

  // TODO: SAMZA-2457: Allow host affinity disabled jobs to move containers to specific host
  if (!hostAffinityEnabled) {
    LOG.info("Changing the requested host for placement action to {} because host affinity is disabled", ResourceRequestState.ANY_HOST);
    requestedHost = ANY_HOST;
  }

  // Register the metadata that tracks this action
  String sourceHost = getSourceHostForContainer(requestMessage);
  ContainerPlacementMetadata placementMetadata = new ContainerPlacementMetadata(requestMessage, sourceHost);
  actions.put(targetProcessorId, placementMetadata);

  // A processor listed in failedProcessors (job running in a degraded state) means the container is already stopped
  boolean containerAlreadyFailed = samzaApplicationState.failedProcessors.containsKey(requestMessage.getProcessorId());
  if (containerAlreadyFailed) {
    placementMetadata.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.STOPPED);
  }

  SamzaResourceRequest placementResourceRequest = containerAllocator.getResourceRequest(targetProcessorId, requestedHost);
  // Keep track of the resource request on the metadata for monitoring
  placementMetadata.recordResourceRequest(placementResourceRequest);
  actions.put(targetProcessorId, placementMetadata);
  updateContainerPlacementActionStatus(placementMetadata, ContainerPlacementMessage.StatusCode.IN_PROGRESS, "Preferred Resources requested");
  containerAllocator.issueResourceRequest(placementResourceRequest);
}
use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.
the class ContainerManager method handleContainerLaunchSuccess.
/**
 * Handles the state update after a container launched successfully. If the launch belongs to an active container
 * placement action, the action's metadata is updated to report success; otherwise nothing is done.
 *
 * @param processorId logical processor id of container 0,1,2
 * @param containerHost host the container was started on, included in the success message
 */
void handleContainerLaunchSuccess(String processorId, String containerHost) {
  // Nothing to report unless a placement action is active for this processor
  if (!hasActiveContainerPlacementAction(processorId)) {
    return;
  }
  ContainerPlacementMetadata placementMetadata = getPlacementActionMetadata(processorId).get();
  // The active container is running again, so flag it and dispatch a response
  placementMetadata.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.RUNNING);
  String successMessage = "Successfully completed the container placement action started container on host " + containerHost;
  updateContainerPlacementActionStatus(placementMetadata, ContainerPlacementMessage.StatusCode.SUCCEEDED, successMessage);
}
use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.
the class ContainerManager method handleContainerLaunch.
/**
 * Handles the container start action for both active & standby containers. This method is invoked by the allocator thread
 *
 * Case 1. If the container launch request is due to an existing container placement action, issue a stop on active
 *         container & wait for the active container to be stopped before issuing a start.
 * Case 2. If StandbyContainer is present refer to {@code StandbyContainerManager#checkStandbyConstraintsAndRunStreamProcessor}
 * Case 3. Otherwise just invoke a container start on the allocated resource for the pending request
 *
 * If a placement action is active but its container status matches none of the branches handled for Case 1 (for
 * example the status is RUNNING while the processor is not present in runningProcessors), control falls through to
 * Case 2 / Case 3.
 *
 * TODO: SAMZA-2399: Investigate & configure a timeout for container stop if needed
 *
 * @param request pending request for the preferred host
 * @param preferredHost preferred host to start the container
 * @param allocatedResource resource allocated from {@link ClusterResourceManager}
 * @param resourceRequestState state of request in {@link ContainerAllocator}
 * @param allocator to request resources from {@link ClusterResourceManager}
 *
 * @return true if the container launch is complete, false if the container launch is in progress. A container launch
 * might be in progress when it is waiting for the previous container incarnation to stop in case of container
 * placement actions
 */
boolean handleContainerLaunch(SamzaResourceRequest request, String preferredHost, SamzaResource allocatedResource, ResourceRequestState resourceRequestState, ContainerAllocator allocator) {
  if (hasActiveContainerPlacementAction(request.getProcessorId())) {
    String processorId = request.getProcessorId();
    ContainerPlacementMetadata actionMetaData = getPlacementActionMetadata(processorId).get();
    ContainerPlacementMetadata.ContainerStatus actionStatus = actionMetaData.getContainerStatus();
    if (samzaApplicationState.runningProcessors.containsKey(processorId) && actionStatus == ContainerPlacementMetadata.ContainerStatus.RUNNING) {
      // The active container is still up: ask it to stop and report the launch as still in progress
      LOG.debug("Requesting running container to shutdown due to existing ContainerPlacement action {}", actionMetaData);
      actionMetaData.setContainerStatus(ContainerPlacementMetadata.ContainerStatus.STOP_IN_PROGRESS);
      updateContainerPlacementActionStatus(actionMetaData, ContainerPlacementMessage.StatusCode.IN_PROGRESS, "Active container stop in progress");
      clusterResourceManager.stopStreamProcessor(samzaApplicationState.runningProcessors.get(processorId));
      return false;
    } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOP_IN_PROGRESS) {
      // A stop was already issued: keep waiting
      LOG.info("Waiting for running container to shutdown due to existing ContainerPlacement action {}", actionMetaData);
      return false;
    } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOP_FAILED) {
      // The stop failed: fail the action and cancel the resource request made for it
      LOG.info("Shutdown on running container failed for action {}", actionMetaData);
      markContainerPlacementActionFailed(actionMetaData, String.format("failed to stop container on current host %s", actionMetaData.getSourceHost()));
      resourceRequestState.cancelResourceRequest(request);
      return true;
    } else if (actionStatus == ContainerPlacementMetadata.ContainerStatus.STOPPED) {
      // Note: Always check constraints against allocated resource, since preferred host can be ANY_HOST as well
      if (standbyContainerManager.isPresent() && !standbyContainerManager.get().checkStandbyConstraints(request.getProcessorId(), allocatedResource.getHost())) {
        LOG.info("Starting container {} on host {} does not meet standby constraints, falling back to source host placement metadata: {}", request.getProcessorId(), preferredHost, actionMetaData);
        // Release unstartable container
        standbyContainerManager.get().releaseUnstartableContainer(request, allocatedResource, preferredHost, resourceRequestState);
        // Fallback to source host since the new allocated resource does not meet standby constraints
        allocator.requestResource(processorId, actionMetaData.getSourceHost());
        markContainerPlacementActionFailed(actionMetaData, String.format("allocated resource %s does not meet standby constraints now, falling back to source host", allocatedResource));
      } else {
        // The "{}" placeholder is required; without it SLF4J silently drops the metadata argument
        LOG.info("Status updated for ContainerPlacement action: {}", actionMetaData);
        allocator.runStreamProcessor(request, preferredHost);
      }
      return true;
    }
  }
  // No placement action handled the launch above: use the standby-aware path if configured, else start directly
  if (this.standbyContainerManager.isPresent()) {
    standbyContainerManager.get().checkStandbyConstraintsAndRunStreamProcessor(request, preferredHost, allocatedResource, allocator, resourceRequestState);
  } else {
    allocator.runStreamProcessor(request, preferredHost);
  }
  return true;
}
use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.
the class TestContainerPlacementActions method testContainerPlacementsForJobRunningInDegradedState.
/**
 * Verifies that a container placement request is honored for a job running in a degraded state: processor "1" is
 * failed twice so that it ends up in failedProcessors, then a placement request moves it to host-3 and the test
 * checks that both containers are running again and failedProcessors is empty.
 */
@Test(timeout = 20000)
public void testContainerPlacementsForJobRunningInDegradedState() throws Exception {
  // Set failure after retries to false to enable job running in degraded state
  config = new MapConfig(configVals, getConfigWithHostAffinityAndRetries(true, 1, false));
  state = new SamzaApplicationState(JobModelManagerTestUtil.getJobModelManager(getConfig(), 2, this.server));
  callback = mock(ClusterResourceManager.Callback.class);
  MockClusterResourceManager clusterResourceManager = new MockClusterResourceManager(callback, state);
  FaultDomainManager faultDomainManager = mock(FaultDomainManager.class);
  ClusterManagerConfig clusterManagerConfig = new ClusterManagerConfig(config);
  containerManager = spy(new ContainerManager(containerPlacementMetadataStore, state, clusterResourceManager, true, false, localityManager, faultDomainManager, config));
  allocatorWithHostAffinity = new MockContainerAllocatorWithHostAffinity(clusterResourceManager, config, state, containerManager);
  cpm = new ContainerProcessManager(clusterManagerConfig, state, new MetricsRegistryMap(), clusterResourceManager, Optional.of(allocatorWithHostAffinity), containerManager, localityManager, false);
  // Forward "resources available" callbacks to the ContainerProcessManager on a separate thread
  doAnswer(new Answer<Void>() {
    public Void answer(InvocationOnMock invocation) {
      new Thread(() -> {
        Object[] args = invocation.getArguments();
        cpm.onResourcesAvailable((List<SamzaResource>) args[0]);
      }, "AMRMClientAsync").start();
      return null;
    }
  }).when(callback).onResourcesAvailable(anyList());
  // Forward stream processor launch success callbacks to the ContainerProcessManager on a separate thread
  doAnswer(new Answer<Void>() {
    public Void answer(InvocationOnMock invocation) {
      new Thread(() -> {
        Object[] args = invocation.getArguments();
        cpm.onStreamProcessorLaunchSuccess((SamzaResource) args[0]);
      }, "AMRMClientAsync").start();
      return null;
    }
  }).when(callback).onStreamProcessorLaunchSuccess(any());
  // Forward "resources completed" callbacks to the ContainerProcessManager on a separate thread
  doAnswer(new Answer<Void>() {
    public Void answer(InvocationOnMock invocation) {
      new Thread(() -> {
        Object[] args = invocation.getArguments();
        cpm.onResourcesCompleted((List<SamzaResourceStatus>) args[0]);
      }, "AMRMClientAsync").start();
      return null;
    }
  }).when(callback).onResourcesCompleted(anyList());
  cpm.start();
  if (!allocatorWithHostAffinity.awaitContainersStart(2, 5, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  while (state.runningProcessors.size() != 2) {
    Thread.sleep(100);
  }
  // App is in running state with two containers running
  assertEquals(state.runningProcessors.size(), 2);
  assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
  assertEquals(state.runningProcessors.get("1").getHost(), "host-2");
  assertEquals(state.preferredHostRequests.get(), 2);
  assertEquals(state.anyHostRequests.get(), 0);
  // Trigger a container failure
  clusterResourceManager.stopStreamProcessor(state.runningProcessors.get("1"), -103);
  // Wait for container to start
  if (!allocatorWithHostAffinity.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  while (state.runningProcessors.size() != 2) {
    Thread.sleep(100);
  }
  // Trigger a container failure again
  clusterResourceManager.stopStreamProcessor(state.runningProcessors.get("1"), -103);
  // Ensure that this container has exhausted all retries: keep waiting until BOTH conditions hold, i.e. one
  // processor is recorded as failed and only one processor is still running
  while (state.failedProcessors.size() != 1 || state.runningProcessors.size() != 1) {
    Thread.sleep(100);
  }
  // At this point the application should only have one container running
  assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
  assertEquals(state.runningProcessors.size(), 1);
  assertEquals(state.pendingProcessors.size(), 0);
  assertTrue(state.failedProcessors.containsKey("1"));
  ContainerPlacementRequestMessage requestMessage = new ContainerPlacementRequestMessage(UUID.randomUUID(), "app-attempt-001", "1", "host-3", System.currentTimeMillis());
  ContainerPlacementMetadata metadata = containerManager.registerContainerPlacementActionForTest(requestMessage, allocatorWithHostAffinity);
  // Wait for the ControlAction to complete
  if (!allocatorWithHostAffinity.awaitContainersStart(1, 2, TimeUnit.SECONDS)) {
    fail("timed out waiting for the containers to start");
  }
  // Wait for both the containers to be in running state AND the control action metadata to succeed; the loop must
  // continue while either condition is still unmet
  while (state.runningProcessors.size() != 2 || metadata.getActionStatus() != ContainerPlacementMessage.StatusCode.SUCCEEDED) {
    Thread.sleep(100);
  }
  assertEquals(state.preferredHostRequests.get(), 4);
  assertEquals(state.runningProcessors.size(), 2);
  // Container 0 stays on host-1 and container 1 is placed on the requested host-3
  assertEquals(state.runningProcessors.get("0").getHost(), "host-1");
  assertEquals(state.runningProcessors.get("1").getHost(), "host-3");
  assertEquals(state.anyHostRequests.get(), 0);
  // Failed processors must be empty
  assertEquals(state.failedProcessors.size(), 0);
}
use of org.apache.samza.clustermanager.container.placement.ContainerPlacementMetadata in project samza by apache.
the class TestContainerPlacementActions method assertBadRequests.
/**
 * Registers a placement request that is expected to be rejected and verifies the outcome: no metadata is returned
 * for the request, a BAD_REQUEST response eventually shows up in the metadata store, and the request message itself
 * is no longer present in the store.
 *
 * @param processorId logical processor id to put in the request
 * @param destinationHost destination host to put in the request
 * @param containerManager container manager the request is registered with
 * @param allocator allocator handed to the container manager along with the request
 * @throws InterruptedException if the thread is interrupted while polling for the response
 */
private void assertBadRequests(String processorId, String destinationHost, ContainerManager containerManager, ContainerAllocator allocator) throws InterruptedException {
  ContainerPlacementRequestMessage badRequest = new ContainerPlacementRequestMessage(UUID.randomUUID(), "app-Attemp-001", processorId, destinationHost, System.currentTimeMillis());
  ContainerPlacementMetadata registeredMetadata = containerManager.registerContainerPlacementActionForTest(badRequest, allocator);
  assertNull(registeredMetadata);

  // Poll the store every 100 ms until a BAD_REQUEST response for this request is available
  Optional<ContainerPlacementResponseMessage> storedResponse = containerPlacementMetadataStore.readContainerPlacementResponseMessage(badRequest.getUuid());
  while (!storedResponse.isPresent() || storedResponse.get().getStatusCode() != ContainerPlacementMessage.StatusCode.BAD_REQUEST) {
    Thread.sleep(100);
    storedResponse = containerPlacementMetadataStore.readContainerPlacementResponseMessage(badRequest.getUuid());
  }

  ContainerPlacementResponseMessage badRequestResponse = storedResponse.get();
  assertEquals(badRequestResponse.getStatusCode(), ContainerPlacementMessage.StatusCode.BAD_REQUEST);
  assertResponseMessage(storedResponse.get(), badRequest);
  // Request shall be deleted as soon as it is acted upon
  boolean requestStillStored = containerPlacementMetadataStore.readContainerPlacementRequestMessage(badRequest.getUuid()).isPresent();
  assertFalse(requestStillStored);
}
Aggregations