Use of com.datatorrent.stram.api.StramEvent in project apex-core by Apache.
The class StreamingAppMasterService, method execute().
/**
* Main run function for the application master
*
* @throws YarnException
*/
@SuppressWarnings("SleepWhileInLoop")
private void execute() throws YarnException, IOException {
LOG.info("Starting ApplicationMaster");
final Configuration conf = getConfig();
if (UserGroupInformation.isSecurityEnabled()) {
tokenRenewer = new TokenRenewer(dag, true, conf, appAttemptID.getApplicationId().toString());
}
// Register self with ResourceManager
RegisterApplicationMasterResponse response = amRmClient.registerApplicationMaster(appMasterHostname, 0, appMasterTrackingUrl);
// Dump out information about cluster capability as seen by the resource manager
int maxMem = response.getMaximumResourceCapability().getMemory();
int maxVcores = response.getMaximumResourceCapability().getVirtualCores();
int minMem = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
int minVcores = conf.getInt("yarn.scheduler.minimum-allocation-vcores", 0);
LOG.info("Max mem {}m, Min mem {}m, Max vcores {} and Min vcores {} capabililty of resources in this cluster ", maxMem, minMem, maxVcores, minVcores);
long blacklistRemovalTime = dag.getValue(DAGContext.BLACKLISTED_NODE_REMOVAL_TIME_MILLIS);
int maxConsecutiveContainerFailures = dag.getValue(DAGContext.MAX_CONSECUTIVE_CONTAINER_FAILURES_FOR_BLACKLIST);
LOG.info("Blacklist removal time in millis = {}, max consecutive node failure count = {}", blacklistRemovalTime, maxConsecutiveContainerFailures);
// for locality relaxation fallback
Map<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> requestedResources = Maps.newHashMap();
// Setup heartbeat emitter
// TODO poll RM every now and then with an empty request to let RM know that we are alive
// The heartbeat interval after which an AM is timed out by the RM is defined by a config setting:
// RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
// The allocate calls to the RM count as heartbeat so, for now, this additional heartbeat emitter
// is not required.
int loopCounter = -1;
long nodeReportUpdateTime = 0;
// keep track of already requested containers to not request them again while waiting for allocation
int numRequestedContainers = 0;
int numReleasedContainers = 0;
int nextRequestPriority = 0;
// Use override for resource requestor in case of Cloudera distribution, to handle host-specific requests
ResourceRequestHandler resourceRequestor = System.getenv().containsKey("CDH_HADOOP_BIN") ? new BlacklistBasedResourceRequestHandler() : new ResourceRequestHandler();
List<ContainerStartRequest> pendingContainerStartRequests = new LinkedList<>();
try (YarnClient clientRMService = StramClientUtils.createYarnClient(conf)) {
try {
// YARN-435
// we need getClusterNodes to populate the initial node list,
// subsequent updates come through the heartbeat response
ApplicationReport ar = StramClientUtils.getStartedAppInstanceByName(clientRMService, dag.getAttributes().get(DAG.APPLICATION_NAME), UserGroupInformation.getLoginUser().getUserName(), dag.getAttributes().get(DAG.APPLICATION_ID));
if (ar != null) {
appDone = true;
dnmgr.shutdownDiagnosticsMessage = String.format("Application master failed due to application %s with duplicate application name \"%s\" by the same user \"%s\" is already started.", ar.getApplicationId().toString(), ar.getName(), ar.getUser());
LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
finishApplication(FinalApplicationStatus.FAILED);
return;
}
resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
nodeReportUpdateTime = System.currentTimeMillis() + UPDATE_NODE_REPORTS_INTERVAL;
} catch (Exception e) {
throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
}
List<Container> containers = response.getContainersFromPreviousAttempts();
// Running containers might take a while to register with the new app master and send the heartbeat signal.
int waitForRecovery = containers.size() > 0 ? dag.getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS) / 1000 : 0;
List<ContainerId> releasedContainers = previouslyAllocatedContainers(containers);
FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
final InetSocketAddress rmAddress = conf.getSocketAddr(YarnConfiguration.RM_ADDRESS, YarnConfiguration.DEFAULT_RM_ADDRESS, YarnConfiguration.DEFAULT_RM_PORT);
while (!appDone) {
loopCounter++;
final long currentTimeMillis = System.currentTimeMillis();
if (tokenRenewer != null) {
tokenRenewer.checkAndRenew();
}
if (currentTimeMillis > nodeReportUpdateTime) {
resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
nodeReportUpdateTime = currentTimeMillis + UPDATE_NODE_REPORTS_INTERVAL;
}
Runnable r;
while ((r = this.pendingTasks.poll()) != null) {
r.run();
}
// need not have any available containers
try {
sleep(1000);
} catch (InterruptedException e) {
LOG.info("Sleep interrupted", e);
}
// Setup request to be sent to RM to allocate containers
List<ContainerRequest> containerRequests = new ArrayList<>();
List<ContainerRequest> removedContainerRequests = new ArrayList<>();
// request containers for pending deploy requests
if (!dnmgr.containerStartRequests.isEmpty()) {
StreamingContainerAgent.ContainerStartRequest csr;
while ((csr = dnmgr.containerStartRequests.poll()) != null) {
if (csr.container.getRequiredMemoryMB() > maxMem) {
LOG.warn("Container memory {}m above max threshold of cluster. Using max value {}m.", csr.container.getRequiredMemoryMB(), maxMem);
csr.container.setRequiredMemoryMB(maxMem);
}
if (csr.container.getRequiredMemoryMB() < minMem) {
csr.container.setRequiredMemoryMB(minMem);
}
if (csr.container.getRequiredVCores() > maxVcores) {
LOG.warn("Container vcores {} above max threshold of cluster. Using max value {}.", csr.container.getRequiredVCores(), maxVcores);
csr.container.setRequiredVCores(maxVcores);
}
if (csr.container.getRequiredVCores() < minVcores) {
csr.container.setRequiredVCores(minVcores);
}
csr.container.setResourceRequestPriority(nextRequestPriority++);
ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
if (cr == null) {
pendingContainerStartRequests.add(csr);
} else {
resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
}
}
}
// If all other requests are allocated, retry pending requests which need host availability
if (containerRequests.isEmpty() && !pendingContainerStartRequests.isEmpty()) {
List<ContainerStartRequest> removalList = new LinkedList<>();
for (ContainerStartRequest csr : pendingContainerStartRequests) {
ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
if (cr != null) {
resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
removalList.add(csr);
}
}
pendingContainerStartRequests.removeAll(removalList);
}
resourceRequestor.reissueContainerRequests(amRmClient, requestedResources, loopCounter, resourceRequestor, containerRequests, removedContainerRequests);
/* Remove nodes from blacklist after timeout */
List<String> blacklistRemovals = new ArrayList<>();
for (String hostname : failedBlackListedNodes) {
Long timeDiff = currentTimeMillis - failedContainerNodesMap.get(hostname).blackListAdditionTime;
if (timeDiff >= blacklistRemovalTime) {
blacklistRemovals.add(hostname);
failedContainerNodesMap.remove(hostname);
}
}
if (!blacklistRemovals.isEmpty()) {
amRmClient.updateBlacklist(null, blacklistRemovals);
LOG.info("Removing nodes {} from blacklist: time elapsed since last blacklisting due to failure is greater than specified timeout", blacklistRemovals.toString());
failedBlackListedNodes.removeAll(blacklistRemovals);
}
numRequestedContainers += containerRequests.size() - removedContainerRequests.size();
AllocateResponse amResp = sendContainerAskToRM(containerRequests, removedContainerRequests, releasedContainers);
if (amResp.getAMCommand() != null) {
LOG.info(" statement executed:{}", amResp.getAMCommand());
switch(amResp.getAMCommand()) {
case AM_RESYNC:
case AM_SHUTDOWN:
throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
default:
throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
}
}
releasedContainers.clear();
// Retrieve list of allocated containers from the response
List<Container> newAllocatedContainers = amResp.getAllocatedContainers();
// LOG.info("Got response from RM for container ask, allocatedCnt=" + newAllocatedContainers.size());
numRequestedContainers -= newAllocatedContainers.size();
long timestamp = System.currentTimeMillis();
for (Container allocatedContainer : newAllocatedContainers) {
LOG.info("Got new container." + ", containerId=" + allocatedContainer.getId() + ", containerNode=" + allocatedContainer.getNodeId() + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory" + allocatedContainer.getResource().getMemory() + ", priority" + allocatedContainer.getPriority());
// + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
boolean alreadyAllocated = true;
StreamingContainerAgent.ContainerStartRequest csr = null;
for (Map.Entry<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> entry : requestedResources.entrySet()) {
if (entry.getKey().container.getResourceRequestPriority() == allocatedContainer.getPriority().getPriority()) {
alreadyAllocated = false;
csr = entry.getKey();
break;
}
}
if (alreadyAllocated) {
LOG.info("Releasing {} as resource with priority {} was already assigned", allocatedContainer.getId(), allocatedContainer.getPriority());
releasedContainers.add(allocatedContainer.getId());
numReleasedContainers++;
// undo the decrement above for this allocated container
numRequestedContainers++;
continue;
}
if (csr != null) {
requestedResources.remove(csr);
}
// allocate resource to container
ContainerResource resource = new ContainerResource(allocatedContainer.getPriority().getPriority(), allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), allocatedContainer.getResource().getMemory(), allocatedContainer.getResource().getVirtualCores(), allocatedContainer.getNodeHttpAddress());
StreamingContainerAgent sca = dnmgr.assignContainer(resource, null);
if (sca == null) {
// allocated container no longer needed, add release request
LOG.warn("Container {} allocated but nothing to deploy, going to release this container.", allocatedContainer.getId());
releasedContainers.add(allocatedContainer.getId());
} else {
AllocatedContainer allocatedContainerHolder = new AllocatedContainer(allocatedContainer);
this.allocatedContainers.put(allocatedContainer.getId().toString(), allocatedContainerHolder);
ByteBuffer tokens = null;
if (UserGroupInformation.isSecurityEnabled()) {
UserGroupInformation ugi = UserGroupInformation.getLoginUser();
Token<StramDelegationTokenIdentifier> delegationToken = allocateDelegationToken(ugi.getUserName(), heartbeatListener.getAddress());
allocatedContainerHolder.delegationToken = delegationToken;
// ByteBuffer tokens = LaunchContainerRunnable.getTokens(delegationTokenManager, heartbeatListener.getAddress());
tokens = LaunchContainerRunnable.getTokens(ugi, delegationToken);
}
LaunchContainerRunnable launchContainer = new LaunchContainerRunnable(allocatedContainer, nmClient, sca, tokens);
// Thread launchThread = new Thread(runnableLaunchContainer);
// launchThreads.add(launchThread);
// launchThread.start();
// communication with NMs is now async
launchContainer.run();
// record container start event
StramEvent ev = new StramEvent.StartContainerEvent(allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), groupingManager.getEventGroupIdForAffectedContainer(allocatedContainer.getId().toString()));
ev.setTimestamp(timestamp);
dnmgr.recordEventAsync(ev);
}
}
// track node updates for future locality constraint allocations
// TODO: it seems 2.0.4-alpha doesn't give us any updates
resourceRequestor.updateNodeReports(amResp.getUpdatedNodes());
// Check the completed containers
List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
// LOG.debug("Got response from RM for container ask, completedCnt=" + completedContainers.size());
List<String> blacklistAdditions = new ArrayList<>();
for (ContainerStatus containerStatus : completedContainers) {
LOG.info("Completed containerId=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
// non complete containers should not be here
assert (containerStatus.getState() == ContainerState.COMPLETE);
AllocatedContainer allocatedContainer = allocatedContainers.remove(containerStatus.getContainerId().toString());
if (allocatedContainer != null && allocatedContainer.delegationToken != null) {
UserGroupInformation ugi = UserGroupInformation.getLoginUser();
delegationTokenManager.cancelToken(allocatedContainer.delegationToken, ugi.getUserName());
}
EventGroupId groupId = null;
int exitStatus = containerStatus.getExitStatus();
if (0 != exitStatus) {
if (allocatedContainer != null) {
numFailedContainers.incrementAndGet();
if (exitStatus != 1 && maxConsecutiveContainerFailures != Integer.MAX_VALUE) {
// If container failure due to framework
String hostname = allocatedContainer.container.getNodeId().getHost();
if (!failedBlackListedNodes.contains(hostname)) {
// Blacklist the node if not already blacklisted
if (failedContainerNodesMap.containsKey(hostname)) {
NodeFailureStats stats = failedContainerNodesMap.get(hostname);
long timeStamp = System.currentTimeMillis();
if (timeStamp - stats.lastFailureTimeStamp >= blacklistRemovalTime) {
// Reset failure count if last failure was before Blacklist removal time
stats.failureCount = 1;
stats.lastFailureTimeStamp = timeStamp;
} else {
stats.lastFailureTimeStamp = timeStamp;
stats.failureCount++;
if (stats.failureCount >= maxConsecutiveContainerFailures) {
LOG.info("Node {} failed {} times consecutively within {} minutes, marking the node blacklisted", hostname, stats.failureCount, blacklistRemovalTime / (60 * 1000));
blacklistAdditions.add(hostname);
failedBlackListedNodes.add(hostname);
}
}
} else {
failedContainerNodesMap.put(hostname, new NodeFailureStats(System.currentTimeMillis(), 1));
}
}
}
}
// if (exitStatus == 1) {
// // non-recoverable StreamingContainer failure
// appDone = true;
// finalStatus = FinalApplicationStatus.FAILED;
// dnmgr.shutdownDiagnosticsMessage = "Unrecoverable failure " + containerStatus.getContainerId();
// LOG.info("Exiting due to: {}", dnmgr.shutdownDiagnosticsMessage);
// }
// else {
// Recoverable failure or process killed (externally or via stop request by AM)
// also occurs when a container was released by the application but never assigned/launched
LOG.debug("Container {} failed or killed.", containerStatus.getContainerId());
String containerIdStr = containerStatus.getContainerId().toString();
dnmgr.scheduleContainerRestart(containerIdStr);
groupId = groupingManager.getEventGroupIdForAffectedContainer(containerIdStr);
// }
} else {
// container completed successfully
numCompletedContainers.incrementAndGet();
LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
// Reset counter for node failure, if exists
String hostname = allocatedContainer.container.getNodeId().getHost();
NodeFailureStats stats = failedContainerNodesMap.get(hostname);
if (stats != null) {
stats.failureCount = 0;
}
}
String containerIdStr = containerStatus.getContainerId().toString();
dnmgr.removeContainerAgent(containerIdStr);
// record container stop event
StramEvent ev = new StramEvent.StopContainerEvent(containerIdStr, containerStatus.getExitStatus(), groupId);
ev.setReason(containerStatus.getDiagnostics());
dnmgr.recordEventAsync(ev);
}
if (!blacklistAdditions.isEmpty()) {
amRmClient.updateBlacklist(blacklistAdditions, null);
long timeStamp = System.currentTimeMillis();
for (String hostname : blacklistAdditions) {
NodeFailureStats stats = failedContainerNodesMap.get(hostname);
stats.blackListAdditionTime = timeStamp;
}
}
if (dnmgr.forcedShutdown) {
LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
finalStatus = FinalApplicationStatus.FAILED;
appDone = true;
} else if (allocatedContainers.isEmpty() && numRequestedContainers == 0 && dnmgr.containerStartRequests.isEmpty()) {
LOG.debug("Exiting as no more containers are allocated or requested");
finalStatus = FinalApplicationStatus.SUCCEEDED;
appDone = true;
}
LOG.debug("Current application state: loop={}, appDone={}, requested={}, released={}, completed={}, failed={}, currentAllocated={}, dnmgr.containerStartRequests={}", loopCounter, appDone, numRequestedContainers, numReleasedContainers, numCompletedContainers, numFailedContainers, allocatedContainers.size(), dnmgr.containerStartRequests);
// monitor child containers
dnmgr.monitorHeartbeat(waitForRecovery > 0);
waitForRecovery = Math.max(waitForRecovery - 1, 0);
}
finishApplication(finalStatus);
}
}
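The allocation loop above records container lifecycle events through the StreamingContainerManager. Below is a minimal sketch of that pattern, assuming the StreamingContainerManager class lives in package com.datatorrent.stram and exposes recordEventAsync as shown; the EventGroupId argument is left null here instead of being resolved through the grouping manager, and all argument values are hypothetical.
import com.datatorrent.stram.StreamingContainerManager;
import com.datatorrent.stram.api.StramEvent;

class ContainerEventRecorderSketch
{
  // Mirrors the StartContainerEvent recorded after a container is launched.
  static void recordStart(StreamingContainerManager dnmgr, String containerId, String nodeId)
  {
    StramEvent ev = new StramEvent.StartContainerEvent(containerId, nodeId, null /* group id omitted in this sketch */);
    ev.setTimestamp(System.currentTimeMillis());
    dnmgr.recordEventAsync(ev);
  }

  // Mirrors the StopContainerEvent recorded for a completed container.
  static void recordStop(StreamingContainerManager dnmgr, String containerId, int exitStatus, String diagnostics)
  {
    StramEvent ev = new StramEvent.StopContainerEvent(containerId, exitStatus, null /* group id omitted in this sketch */);
    ev.setReason(diagnostics);
    dnmgr.recordEventAsync(ev);
  }
}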
Use of com.datatorrent.stram.api.StramEvent in project apex-core by Apache.
The class StreamingContainerManager, method removeContainerAgent().
public void removeContainerAgent(String containerId) {
LOG.debug("Removing container agent {}", containerId);
StreamingContainerAgent containerAgent = containers.remove(containerId);
if (containerAgent != null) {
// record operator stop for this container
for (PTOperator oper : containerAgent.container.getOperators()) {
StramEvent ev = new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), containerId, groupingManager.getEventGroupIdForContainer(containerId));
recordEventAsync(ev);
}
containerAgent.container.setFinishedTime(System.currentTimeMillis());
containerAgent.container.setState(PTContainer.State.KILLED);
completedContainers.put(containerId, containerAgent.getContainerInfo());
}
}
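Following removeContainerAgent above, here is a sketch of emitting one StopOperatorEvent per operator of a finished container. The PTContainer and PTOperator import paths are assumed, the class wrapper is hypothetical, and the event group id is again omitted (null) rather than obtained from the grouping manager.
import com.datatorrent.stram.StreamingContainerManager;
import com.datatorrent.stram.api.StramEvent;
import com.datatorrent.stram.plan.physical.PTContainer;
import com.datatorrent.stram.plan.physical.PTOperator;

class OperatorStopEventSketch
{
  static void recordOperatorStops(StreamingContainerManager scm, PTContainer container, String containerId)
  {
    // one event per deployed operator, as removeContainerAgent does
    for (PTOperator oper : container.getOperators()) {
      StramEvent ev = new StramEvent.StopOperatorEvent(oper.getName(), oper.getId(), containerId, null /* group id omitted */);
      scm.recordEventAsync(ev);
    }
  }
}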
Use of com.datatorrent.stram.api.StramEvent in project apex-core by Apache.
The class StreamingContainerManager, method monitorHeartbeat().
/**
* Check periodically that deployed containers phone home.
* Run from the master main loop (single threaded access).
*/
public void monitorHeartbeat(boolean waitForRecovery) {
long currentTms = clock.getTime();
// look for resource allocation timeout
if (!pendingAllocation.isEmpty()) {
if (lastResourceRequest + plan.getLogicalPlan().getValue(LogicalPlan.RESOURCE_ALLOCATION_TIMEOUT_MILLIS) < currentTms) {
String msg = String.format("Shutdown due to resource allocation timeout (%s ms) waiting for %s containers", currentTms - lastResourceRequest, pendingAllocation.size());
LOG.warn(msg);
for (PTContainer c : pendingAllocation) {
LOG.warn("Waiting for resource: {}m priority: {} {}", c.getRequiredMemoryMB(), c.getResourceRequestPriority(), c);
}
shutdownAllContainers(ShutdownType.ABORT, msg);
this.forcedShutdown = true;
} else {
for (PTContainer c : pendingAllocation) {
LOG.debug("Waiting for resource: {}m {}", c.getRequiredMemoryMB(), c);
}
}
}
// monitor currently deployed containers
for (StreamingContainerAgent sca : containers.values()) {
PTContainer c = sca.container;
if (!pendingAllocation.contains(c) && c.getExternalId() != null) {
if (sca.lastHeartbeatMillis == 0) {
// container allocated but process was either not launched or is not able to phone home
if (currentTms - sca.createdMillis > 2 * this.vars.heartbeatTimeoutMillis) {
LOG.warn("Container {}@{} startup timeout ({} ms).", c.getExternalId(), c.host, currentTms - sca.createdMillis);
containerStopRequests.put(c.getExternalId(), c.getExternalId());
}
} else {
if (currentTms - sca.lastHeartbeatMillis > this.vars.heartbeatTimeoutMillis) {
if (!isApplicationIdle()) {
// Check if the heartbeat for this agent has already been missed to raise the StramEvent only once
if (sca.lastHeartbeatMillis != -1) {
String msg = String.format("Container %s@%s heartbeat timeout (%d%n ms).", c.getExternalId(), c.host, currentTms - sca.lastHeartbeatMillis);
LOG.warn(msg);
StramEvent stramEvent = new StramEvent.ContainerErrorEvent(c.getExternalId(), msg, null, null);
stramEvent.setReason(msg);
recordEventAsync(stramEvent);
sca.lastHeartbeatMillis = -1;
}
// request stop (kill) as process may still be hanging around (would have been detected by Yarn otherwise)
containerStopRequests.put(c.getExternalId(), c.getExternalId());
}
}
}
}
}
// events that may modify the plan
processEvents();
committedWindowId = updateCheckpoints(waitForRecovery);
if (lastCommittedWindowId != committedWindowId) {
apexPluginDispatcher.dispatch(new DAGExecutionEvent.CommitExecutionEvent(committedWindowId));
lastCommittedWindowId = committedWindowId;
}
calculateEndWindowStats();
if (this.vars.enableStatsRecording) {
recordStats(currentTms);
}
}
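A sketch of the heartbeat-timeout error event raised in monitorHeartbeat above. The two trailing null constructor arguments mirror the usage shown (their exact meaning is not visible in this snippet), and the method name, class wrapper and message format are hypothetical.
import com.datatorrent.stram.StreamingContainerManager;
import com.datatorrent.stram.api.StramEvent;

class HeartbeatTimeoutEventSketch
{
  static void recordTimeout(StreamingContainerManager scm, String externalId, String host, long silentMillis)
  {
    // same message shape as the warning logged by monitorHeartbeat
    String msg = String.format("Container %s@%s heartbeat timeout (%d ms).", externalId, host, silentMillis);
    StramEvent ev = new StramEvent.ContainerErrorEvent(externalId, msg, null, null);
    ev.setReason(msg);
    scm.recordEventAsync(ev);
  }
}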
Use of com.datatorrent.stram.api.StramEvent in project apex-core by Apache.
The class PhysicalPlan, method redoPartitions().
private void redoPartitions(PMapping currentMapping, String note) {
Partitioner<Operator> partitioner = getPartitioner(currentMapping);
if (partitioner == null) {
LOG.warn("No partitioner for {}", currentMapping.logicalOperator);
return;
}
RepartitionContext mainPC = new RepartitionContext(partitioner, currentMapping, 0);
if (mainPC.newPartitions.isEmpty()) {
LOG.warn("Empty partition list after repartition: {}", currentMapping.logicalOperator);
return;
}
int memoryPerPartition = currentMapping.logicalOperator.getValue(OperatorContext.MEMORY_MB);
for (Map.Entry<OutputPortMeta, StreamMeta> stream : currentMapping.logicalOperator.getOutputStreams().entrySet()) {
if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
}
}
for (OperatorMeta pp : currentMapping.parallelPartitions) {
for (Map.Entry<OutputPortMeta, StreamMeta> stream : pp.getOutputStreams().entrySet()) {
if (stream.getValue().getLocality() != Locality.THREAD_LOCAL && stream.getValue().getLocality() != Locality.CONTAINER_LOCAL) {
memoryPerPartition += stream.getKey().getValue(PortContext.BUFFER_MEMORY_MB);
}
}
memoryPerPartition += pp.getValue(OperatorContext.MEMORY_MB);
}
int requiredMemoryMB = (mainPC.newPartitions.size() - mainPC.currentPartitions.size()) * memoryPerPartition;
if (requiredMemoryMB > availableMemoryMB) {
LOG.warn("Insufficient headroom for repartitioning: available {}m required {}m", availableMemoryMB, requiredMemoryMB);
return;
}
List<Partition<Operator>> addedPartitions = new ArrayList<>();
// determine modifications of partition set, identify affected operator instance(s)
for (Partition<Operator> newPartition : mainPC.newPartitions) {
PTOperator op = mainPC.currentPartitionMap.remove(newPartition);
if (op == null) {
addedPartitions.add(newPartition);
} else {
// check whether mapping was changed
for (DefaultPartition<Operator> pi : mainPC.currentPartitions) {
if (pi == newPartition && pi.isModified()) {
// existing partition changed (operator or partition keys)
// remove/add to update subscribers and state
mainPC.currentPartitionMap.put(newPartition, op);
addedPartitions.add(newPartition);
}
}
}
}
// remaining entries represent deprecated partitions
this.undeployOpers.addAll(mainPC.currentPartitionMap.values());
// downstream dependencies require redeploy, resolve prior to modifying plan
Set<PTOperator> deps = this.getDependents(mainPC.currentPartitionMap.values());
this.undeployOpers.addAll(deps);
// dependencies need redeploy, except operators excluded in remove
this.deployOpers.addAll(deps);
// process parallel partitions before removing operators from the plan
LinkedHashMap<PMapping, RepartitionContext> partitionContexts = Maps.newLinkedHashMap();
Stack<OperatorMeta> parallelPartitions = new Stack<>();
parallelPartitions.addAll(currentMapping.parallelPartitions);
pendingLoop: while (!parallelPartitions.isEmpty()) {
OperatorMeta ppMeta = parallelPartitions.pop();
for (StreamMeta s : ppMeta.getInputStreams().values()) {
if (currentMapping.parallelPartitions.contains(s.getSource().getOperatorMeta()) && parallelPartitions.contains(s.getSource().getOperatorMeta())) {
parallelPartitions.push(ppMeta);
parallelPartitions.remove(s.getSource().getOperatorMeta());
parallelPartitions.push(s.getSource().getOperatorMeta());
continue pendingLoop;
}
}
LOG.debug("Processing parallel partition {}", ppMeta);
PMapping ppm = this.logicalToPTOperator.get(ppMeta);
Partitioner<Operator> ppp = getPartitioner(ppm);
if (ppp == null) {
partitionContexts.put(ppm, null);
} else {
RepartitionContext pc = new RepartitionContext(ppp, ppm, mainPC.newPartitions.size());
if (pc.newPartitions == null) {
throw new IllegalStateException("Partitioner returns null for parallel partition " + ppm.logicalOperator);
}
partitionContexts.put(ppm, pc);
}
}
// plan updates start here, after all changes were identified
// remove obsolete operators first, any freed resources
// can subsequently be used for new/modified partitions
List<PTOperator> copyPartitions = Lists.newArrayList(currentMapping.partitions);
// remove deprecated partitions from plan
for (PTOperator p : mainPC.currentPartitionMap.values()) {
copyPartitions.remove(p);
removePartition(p, currentMapping);
mainPC.operatorIdToPartition.remove(p.getId());
}
currentMapping.partitions = copyPartitions;
// add new operators
for (Partition<Operator> newPartition : addedPartitions) {
PTOperator p = addPTOperator(currentMapping, newPartition, mainPC.minCheckpoint);
mainPC.operatorIdToPartition.put(p.getId(), newPartition);
}
// process parallel partition changes
for (Map.Entry<PMapping, RepartitionContext> e : partitionContexts.entrySet()) {
if (e.getValue() == null) {
// no partitioner, add required operators
for (int i = 0; i < addedPartitions.size(); i++) {
LOG.debug("Automatically adding to parallel partition {}", e.getKey());
// set activation windowId to conform to upstream checkpoints
addPTOperator(e.getKey(), null, mainPC.minCheckpoint);
}
} else {
RepartitionContext pc = e.getValue();
// track previous parallel partition mapping
Map<Partition<Operator>, Partition<Operator>> prevMapping = Maps.newHashMap();
for (int i = 0; i < mainPC.currentPartitions.size(); i++) {
prevMapping.put(pc.currentPartitions.get(i), mainPC.currentPartitions.get(i));
}
// determine which new partitions match upstream, remaining to be treated as new operator
Map<Partition<Operator>, Partition<Operator>> newMapping = Maps.newHashMap();
Iterator<Partition<Operator>> itMain = mainPC.newPartitions.iterator();
Iterator<Partition<Operator>> itParallel = pc.newPartitions.iterator();
while (itMain.hasNext() && itParallel.hasNext()) {
newMapping.put(itParallel.next(), itMain.next());
}
for (Partition<Operator> newPartition : pc.newPartitions) {
PTOperator op = pc.currentPartitionMap.remove(newPartition);
if (op == null) {
pc.addedPartitions.add(newPartition);
} else if (prevMapping.get(newPartition) != newMapping.get(newPartition)) {
// upstream partitions don't match, remove/add to replace with new operator
pc.currentPartitionMap.put(newPartition, op);
pc.addedPartitions.add(newPartition);
} else {
// check whether mapping was changed - based on DefaultPartition implementation
for (DefaultPartition<Operator> pi : pc.currentPartitions) {
if (pi == newPartition && pi.isModified()) {
// existing partition changed (operator or partition keys)
// remove/add to update subscribers and state
mainPC.currentPartitionMap.put(newPartition, op);
pc.addedPartitions.add(newPartition);
}
}
}
}
if (!pc.currentPartitionMap.isEmpty()) {
// remove obsolete partitions
List<PTOperator> cowPartitions = Lists.newArrayList(e.getKey().partitions);
for (PTOperator p : pc.currentPartitionMap.values()) {
cowPartitions.remove(p);
removePartition(p, e.getKey());
pc.operatorIdToPartition.remove(p.getId());
}
e.getKey().partitions = cowPartitions;
}
// add new partitions
for (Partition<Operator> newPartition : pc.addedPartitions) {
PTOperator oper = addPTOperator(e.getKey(), newPartition, mainPC.minCheckpoint);
pc.operatorIdToPartition.put(oper.getId(), newPartition);
}
getPartitioner(e.getKey()).partitioned(pc.operatorIdToPartition);
}
}
updateStreamMappings(currentMapping);
for (PMapping pp : partitionContexts.keySet()) {
updateStreamMappings(pp);
}
deployChanges();
if (mainPC.currentPartitions.size() != mainPC.newPartitions.size()) {
StramEvent ev = new StramEvent.PartitionEvent(currentMapping.logicalOperator.getName(), mainPC.currentPartitions.size(), mainPC.newPartitions.size());
ev.setReason(note);
this.ctx.recordEventAsync(ev);
}
partitioner.partitioned(mainPC.operatorIdToPartition);
}
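redoPartitions above records a PartitionEvent only when the partition count actually changes. The sketch below builds such an event and returns it, leaving the recordEventAsync call to the caller; the class name is hypothetical and the reason string is just an example of the "note" argument.
import com.datatorrent.stram.api.StramEvent;

class PartitionEventSketch
{
  static StramEvent partitionCountChanged(String operatorName, int oldCount, int newCount, String note)
  {
    StramEvent ev = new StramEvent.PartitionEvent(operatorName, oldCount, newCount);
    ev.setReason(note);  // e.g. "repartition triggered by load", mirroring ev.setReason(note) above
    return ev;  // caller records it, e.g. via StreamingContainerManager#recordEventAsync
  }
}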
Use of com.datatorrent.stram.api.StramEvent in project apex-core by Apache.
The class PluginTests, method testDispatch().
@Test
public void testDispatch() throws InterruptedException {
DebugPlugin debugPlugin = new DebugPlugin();
StaticPluginLocator<? extends DAGExecutionPlugin> locator = new StaticPluginLocator<>(debugPlugin);
ApexPluginDispatcher pluginManager = new DefaultApexPluginDispatcher(locator, new StramTestSupport.TestAppContext(new Attribute.AttributeMap.DefaultAttributeMap()), null, null);
pluginManager.init(new Configuration());
pluginManager.dispatch(new DAGExecutionEvent.StramExecutionEvent(new StramEvent(StramEvent.LogLevel.DEBUG) {
@Override
public String getType() {
return "TestEvent";
}
}));
pluginManager.dispatch(new DAGExecutionEvent.CommitExecutionEvent(1234));
pluginManager.dispatch(new DAGExecutionEvent.HeartbeatExecutionEvent(new StreamingContainerUmbilicalProtocol.ContainerHeartbeat()));
LogicalPlan plan = new LogicalPlan();
pluginManager.dispatch(new ApexPluginDispatcher.DAGChangeEvent(plan));
debugPlugin.waitForEventDelivery(10);
pluginManager.stop();
Assert.assertEquals(1, debugPlugin.getEventCount());
Assert.assertEquals(1, debugPlugin.getHeartbeatCount());
Assert.assertEquals(1, debugPlugin.getCommitCount());
Assert.assertEquals(plan, debugPlugin.getLogicalPlan());
}
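Since StramEvent is abstract, the test above declares an ad-hoc event type inline by implementing getType(). The same can be done with a named subclass, as in this sketch; the class name and returned type label are hypothetical.
import com.datatorrent.stram.api.StramEvent;

class CustomStramEvent extends StramEvent
{
  CustomStramEvent()
  {
    super(StramEvent.LogLevel.DEBUG);  // log-level constructor, as used by the anonymous class in the test
  }

  @Override
  public String getType()
  {
    return "CustomEvent";  // label reported when the event is recorded or dispatched
  }
}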