Search in sources :

Example 56 with PTOperator

Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by Apache.

In class StreamingContainerManager, method aggregateMetrics.

/**
 * Aggregates per-partition end-window statistics into logical (per logical operator) values.
 * <p>
 * Two passes are made over all logical operators: the first feeds the deprecated counters aggregator and
 * stores its result in {@code latestLogicalCounters}; the second feeds the {@link AutoMetric.Aggregator}
 * and stores its result in both the bounded per-operator queue in {@code logicalMetrics} and in
 * {@code latestLogicalMetrics}.
 *
 * @param windowId id of the window the statistics belong to
 * @param endWindowStatsMap end-window statistics keyed by physical operator id
 */
private void aggregateMetrics(long windowId, Map<Integer, EndWindowStats> endWindowStatsMap) {
    Collection<OperatorMeta> allLogicalOperators = getLogicalPlan().getAllOperators();

    // pass 1: deprecated counters aggregation, kept for backward compatibility
    for (OperatorMeta logicalOperator : allLogicalOperators) {
        @SuppressWarnings("deprecation") Context.CountersAggregator countersAggregator = logicalOperator.getValue(OperatorContext.COUNTERS_AGGREGATOR);
        if (countersAggregator != null) {
            List<Object> partitionCounters = Lists.newArrayList();
            for (PTOperator partition : plan.getAllOperators(logicalOperator)) {
                EndWindowStats partitionStats = endWindowStatsMap.get(partition.getId());
                if (partitionStats == null || partitionStats.counters == null) {
                    continue;
                }
                partitionCounters.add(partitionStats.counters);
            }
            if (!partitionCounters.isEmpty()) {
                @SuppressWarnings("deprecation") Object aggregated = countersAggregator.aggregate(partitionCounters);
                latestLogicalCounters.put(logicalOperator.getName(), aggregated);
            }
        }
    }

    // pass 2: auto metrics aggregation
    for (OperatorMeta logicalOperator : allLogicalOperators) {
        AutoMetric.Aggregator metricAggregator = null;
        if (logicalOperator.getMetricAggregatorMeta() != null) {
            metricAggregator = logicalOperator.getMetricAggregatorMeta().getAggregator();
        }
        if (metricAggregator == null) {
            continue;
        }

        List<AutoMetric.PhysicalMetricsContext> partitionMetrics = Lists.newArrayList();
        for (PTOperator partition : plan.getAllOperators(logicalOperator)) {
            EndWindowStats partitionStats = endWindowStatsMap.get(partition.getId());
            if (partitionStats == null || partitionStats.metrics == null) {
                continue;
            }
            partitionMetrics.add(new PhysicalMetricsContextImpl(partition.getId(), partitionStats.metrics));
        }
        if (partitionMetrics.isEmpty()) {
            // no partition reported metrics for this window
            continue;
        }

        Map<String, Object> aggregatedMetrics = metricAggregator.aggregate(windowId, partitionMetrics);
        if (aggregatedMetrics == null || aggregatedMetrics.isEmpty()) {
            continue;
        }

        String logicalName = logicalOperator.getName();
        Queue<Pair<Long, Map<String, Object>>> history = logicalMetrics.get(logicalName);
        if (history == null) {
            // bounded history: add() drops the head once at most one free slot is left
            history = new LinkedBlockingQueue<Pair<Long, Map<String, Object>>>(METRIC_QUEUE_SIZE) {

                private static final long serialVersionUID = 1L;

                @Override
                public boolean add(Pair<Long, Map<String, Object>> entry) {
                    if (remainingCapacity() <= 1) {
                        remove();
                    }
                    return super.add(entry);
                }
            };
            logicalMetrics.put(logicalName, history);
        }
        LOG.debug("Adding to logical metrics for {}", logicalName);
        history.add(new Pair<>(windowId, aggregatedMetrics));

        Map<String, Object> previous = latestLogicalMetrics.put(logicalName, aggregatedMetrics);
        if (previous == null) {
            // first metrics seen for this logical operator: persist the application meta info
            try {
                saveMetaInfo();
            } catch (IOException ex) {
                LOG.error("Cannot save application meta info to DFS. App data sources will not be available.", ex);
            }
        }
    }
}
Also used : Pair(com.datatorrent.common.util.Pair) PortContextPair(com.datatorrent.stram.plan.logical.Operators.PortContextPair) FileContext(org.apache.hadoop.fs.FileContext) Context(com.datatorrent.api.Context) ContainerContext(com.datatorrent.stram.api.ContainerContext) StreamingContainerContext(com.datatorrent.stram.api.StreamingContainerUmbilicalProtocol.StreamingContainerContext) OperatorContext(com.datatorrent.api.Context.OperatorContext) PlanContext(com.datatorrent.stram.plan.physical.PhysicalPlan.PlanContext) OperatorMeta(com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) IOException(java.io.IOException) AutoMetric(com.datatorrent.api.AutoMetric) MutableLong(org.apache.commons.lang3.mutable.MutableLong) MovingAverageLong(com.datatorrent.stram.util.MovingAverage.MovingAverageLong) AtomicLong(java.util.concurrent.atomic.AtomicLong) JSONObject(org.codehaus.jettison.json.JSONObject) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap)

Example 57 with PTOperator

Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by Apache.

In class StreamingContainerManager, method deploy.

/**
 * Applies a physical plan change.
 * <p>
 * In order: marks operators as pending undeploy, requests the new containers, resets the buffer server
 * publishers of operators that are redeployed into already existing containers, marks operators as
 * pending deploy and finally requests shutdown of the released containers.
 * <p>
 * {@code deployChangeInProgress} is set while the method runs; {@code deployChangeCnt} is incremented and
 * the flag cleared in a {@code finally} block, i.e. also when an exception escapes.
 *
 * @param releaseContainers containers that are no longer used and get a shutdown request
 * @param undeploy operators to undeploy
 * @param startContainers containers to request
 * @param deploy operators to (re)deploy
 */
@Override
public void deploy(Set<PTContainer> releaseContainers, Collection<PTOperator> undeploy, Set<PTContainer> startContainers, Collection<PTOperator> deploy) {
    try {
        this.deployChangeInProgress.set(true);
        Map<PTContainer, List<PTOperator>> undeployGroups = groupByContainer(undeploy);
        // order does not matter, remove all operators in each container in one sweep
        for (Map.Entry<PTContainer, List<PTOperator>> e : undeployGroups.entrySet()) {
            // container may already be in failed or pending deploy state, notified by RM or timed out
            PTContainer c = e.getKey();
            // only containers that are neither started, released nor killed get their operators marked
            if (!startContainers.contains(c) && !releaseContainers.contains(c) && c.getState() != PTContainer.State.KILLED) {
                LOG.debug("scheduling undeploy {} {}", e.getKey().getExternalId(), e.getValue());
                for (PTOperator oper : e.getValue()) {
                    oper.setState(PTOperator.State.PENDING_UNDEPLOY);
                }
            }
        }
        // start new containers
        for (PTContainer c : startContainers) {
            requestContainer(c);
        }
        // (re)deploy affected operators
        // can happen in parallel after buffer server for recovered publishers is reset
        Map<PTContainer, List<PTOperator>> deployGroups = groupByContainer(deploy);
        for (Map.Entry<PTContainer, List<PTOperator>> e : deployGroups.entrySet()) {
            if (!startContainers.contains(e.getKey())) {
                // to reset publishers, clean buffer server past checkpoint so subscribers don't read stale data (including end of stream)
                for (PTOperator operator : e.getValue()) {
                    for (PTOperator.PTOutput out : operator.getOutputs()) {
                        if (!out.isDownStreamInline()) {
                            for (InputPortMeta ipm : out.logicalStream.getSinks()) {
                                StreamCodec<?> streamCodec = ipm.getStreamCodec();
                                Integer codecId = plan.getStreamCodecIdentifier(streamCodec);
                                // following needs to match the concat logic in StreamingContainer
                                String sourceIdentifier = Integer.toString(operator.getId()).concat(Component.CONCAT_SEPARATOR).concat(out.portName).concat(Component.CONCAT_SEPARATOR).concat(codecId.toString());
                                if (operator.getContainer().getState() == PTContainer.State.ACTIVE) {
                                    // TODO: unit test - find way to mock this when testing rest of logic
                                    if (operator.getContainer().bufferServerAddress.getPort() != 0) {
                                        BufferServerController bsc = getBufferServerClient(operator);
                                        // ensures new subscriber starting to read from checkpoint will wait until publisher redeploy cycle is complete
                                        try {
                                            bsc.reset(null, sourceIdentifier, 0);
                                        } catch (Exception ex) {
                                            // best effort: a failed reset is logged and deployment continues.
                                            // The exception is the trailing argument without a placeholder of its own,
                                            // matching the other error logs in this class.
                                            LOG.error("Failed to reset buffer server {}", sourceIdentifier, ex);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // add to operators that we expect to deploy
            LOG.debug("scheduling deploy {} {}", e.getKey().getExternalId(), e.getValue());
            for (PTOperator oper : e.getValue()) {
                // operator will be deployed after it has been undeployed, if still referenced by the container
                if (oper.getState() != PTOperator.State.PENDING_UNDEPLOY) {
                    oper.setState(PTOperator.State.PENDING_DEPLOY);
                }
            }
        }
        // stop containers that are no longer used
        for (PTContainer c : releaseContainers) {
            // containers without an external id are skipped
            if (c.getExternalId() == null) {
                continue;
            }
            StreamingContainerAgent sca = containers.get(c.getExternalId());
            if (sca != null) {
                LOG.debug("Container marked for shutdown: {}", c);
                // container already removed from plan
                // TODO: monitor soft shutdown
                sca.requestShutDown(ShutdownType.ABORT);
            }
        }
    } finally {
        this.deployChangeCnt++;
        this.deployChangeInProgress.set(false);
    }
}
Also used : PTOperator(com.datatorrent.stram.plan.physical.PTOperator) InputPortMeta(com.datatorrent.stram.plan.logical.LogicalPlan.InputPortMeta) NotFoundException(org.apache.hadoop.yarn.webapp.NotFoundException) IOException(java.io.IOException) JSONException(org.codehaus.jettison.json.JSONException) KryoException(com.esotericsoftware.kryo.KryoException) PTOutput(com.datatorrent.stram.plan.physical.PTOperator.PTOutput) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) ConcurrentSkipListMap(java.util.concurrent.ConcurrentSkipListMap)

Example 58 with PTOperator

Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by Apache.

In class StreamingContainerManager, method updateOperatorLatency.

/**
 * Computes the latency of an operator relative to its upstream operators and records it.
 * <p>
 * For every input whose upstream operator is not a {@link Operator.DelayOperator} a per-port latency is
 * determined; the smallest of these is added to the operator's {@code latencyMA} and the corresponding
 * upstream operator is stored in {@code slowestUpstreamOp}.
 * <p>
 * When end-window stats exist for the operator's current window the per-port latency is derived from the
 * end-window emit timestamps (adjusted by RPC latency); otherwise it is estimated from the distance
 * between the upstream and the operator's current window ids.
 *
 * @param oper operator to compute the latency for
 * @param ctx source of end-window emit timestamps and RPC latencies
 * @return the recorded latency, or {@code -1} when the operator has no inputs, its current window id is
 *         not positive, or no upstream operator qualified
 */
public long updateOperatorLatency(PTOperator oper, UpdateOperatorLatencyContext ctx) {
    if (oper.getInputs().isEmpty() || oper.stats.currentWindowId.get() <= 0) {
        return -1;
    }

    OperatorStatus operStatus = oper.stats;
    long bestLatency = Long.MAX_VALUE;
    PTOperator selectedUpstream = null;
    int windowWidthMillis = plan.getLogicalPlan().getValue(LogicalPlan.STREAMING_WINDOW_SIZE_MILLIS);
    int heartbeatTimeoutMillis = plan.getLogicalPlan().getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS);
    long currentWindowId = operStatus.currentWindowId.get();

    if (ctx.endWindowStatsExists(currentWindowId)) {
        long ownEmitTime = ctx.getEndWindowEmitTimestamp(currentWindowId, oper);
        long ownAdjustedEmitTime = ownEmitTime + ctx.getRPCLatency(oper);
        for (PTInput in : oper.getInputs()) {
            PTOperator upstream = in.source.source;
            if (upstream.getOperatorMeta().getOperator() instanceof Operator.DelayOperator) {
                continue;
            }
            long upstreamEmitTime = ctx.getEndWindowEmitTimestamp(currentWindowId, upstream);
            if (upstreamEmitTime < 0) {
                continue;
            }
            // negative differences are clamped to zero
            long portLatency = Math.max(0L, ownAdjustedEmitTime - (upstreamEmitTime + ctx.getRPCLatency(upstream)));
            long windowsBehindLatency = WindowGenerator.compareWindowId(upstream.stats.currentWindowId.get(), oper.stats.currentWindowId.get(), windowWidthMillis) * windowWidthMillis;
            // the window distance only wins when it exceeds both the timestamp latency and the heartbeat timeout
            if (windowsBehindLatency > portLatency && windowsBehindLatency > heartbeatTimeoutMillis) {
                portLatency = windowsBehindLatency;
            }
            if (portLatency < bestLatency) {
                bestLatency = portLatency;
                selectedUpstream = upstream;
            }
        }
    } else {
        // the end window stats for the current window id is not available, estimate latency by looking at upstream window id
        for (PTInput in : oper.getInputs()) {
            PTOperator upstream = in.source.source;
            if (upstream.getOperatorMeta().getOperator() instanceof Operator.DelayOperator) {
                continue;
            }
            if (upstream.stats.currentWindowId.get() < oper.stats.currentWindowId.get()) {
                continue;
            }
            long portLatency = WindowGenerator.compareWindowId(upstream.stats.currentWindowId.get(), oper.stats.currentWindowId.get(), windowWidthMillis) * windowWidthMillis;
            if (portLatency < bestLatency) {
                bestLatency = portLatency;
                selectedUpstream = upstream;
            }
        }
    }

    if (selectedUpstream == null) {
        return -1;
    }
    operStatus.latencyMA.add(bestLatency);
    slowestUpstreamOp.put(oper, selectedUpstream);
    return bestLatency;
}
Also used : PTOperator(com.datatorrent.stram.plan.physical.PTOperator) PTInput(com.datatorrent.stram.plan.physical.PTOperator.PTInput) OperatorStatus(com.datatorrent.stram.plan.physical.OperatorStatus) LogicalOperatorStatus(com.datatorrent.stram.plan.logical.LogicalOperatorStatus) Checkpoint(com.datatorrent.stram.api.Checkpoint)

Example 59 with PTOperator

Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by Apache.

In class StreamingContainerManager, method purgeCheckpoints.

/**
 * Submits one deletion task to {@code poolExecutor} for every queued (operator, window id) pair whose
 * operator is not stateless, then clears the queue.
 * <p>
 * Each task deletes the checkpoint through the operator's {@code STORAGE_AGENT}; an {@link IOException}
 * during deletion is logged and not rethrown.
 */
private void purgeCheckpoints() {
    for (Pair<PTOperator, Long> pending : purgeCheckpoints) {
        final PTOperator operator = pending.getFirst();
        if (operator.isOperatorStateLess()) {
            // stateless operators are skipped
            continue;
        }
        final long windowId = pending.getSecond();
        poolExecutor.submit(new Runnable() {

            @Override
            public void run() {
                try {
                    operator.getOperatorMeta().getValue(OperatorContext.STORAGE_AGENT).delete(operator.getId(), windowId);
                } catch (IOException ex) {
                    LOG.error("Failed to purge checkpoint for operator {} for windowId {}", operator, windowId, ex);
                }
            }
        });
    }
    purgeCheckpoints.clear();
}
Also used : PTOperator(com.datatorrent.stram.plan.physical.PTOperator) MutableLong(org.apache.commons.lang3.mutable.MutableLong) MovingAverageLong(com.datatorrent.stram.util.MovingAverage.MovingAverageLong) AtomicLong(java.util.concurrent.atomic.AtomicLong) IOException(java.io.IOException)

Example 60 with PTOperator

Use of com.datatorrent.stram.plan.physical.PTOperator in project apex-core by Apache.

In class StreamingContainerManager, method fillLogicalOperatorInfo.

/**
 * Builds the {@link LogicalOperatorInfo} of a logical operator by combining the stats of all its physical
 * operators.
 * <p>
 * Throughput, latency and checkpoint time are taken from non-unifier partitions only; CPU, heartbeat,
 * window ids, state counts, container ids and hosts include unifiers. Heartbeat, current window id and
 * recovery window id hold the smallest value seen (a stored value of 0 is treated as "not set yet");
 * latency holds the largest.
 *
 * @param operator logical operator to describe
 * @return the populated info object; counters, auto metrics and checkpoint time are only set when there
 *         is at least one physical operator and a checkpoint time average is available
 */
private LogicalOperatorInfo fillLogicalOperatorInfo(OperatorMeta operator) {
    LogicalOperatorInfo info = new LogicalOperatorInfo();
    info.name = operator.getName();
    info.className = operator.getOperator().getClass().getName();
    info.totalTuplesEmitted = operator.getStatus().totalTuplesEmitted;
    info.totalTuplesProcessed = operator.getStatus().totalTuplesProcessed;
    info.failureCount = operator.getStatus().failureCount;
    info.status = new HashMap<>();
    info.partitions = new TreeSet<>();
    info.unifiers = new TreeSet<>();
    info.containerIds = new TreeSet<>();
    info.hosts = new TreeSet<>();

    Collection<PTOperator> instances = getPhysicalPlan().getAllOperators(operator);
    NumberAggregate.LongAggregate checkpointTimes = new NumberAggregate.LongAggregate();

    for (PTOperator instance : instances) {
        OperatorStatus stats = instance.stats;

        if (!instance.isUnifier()) {
            info.partitions.add(instance.getId());
            // exclude unifier, not sure if we should include it in the future
            info.tuplesEmittedPSMA += stats.tuplesEmittedPSMA.get();
            info.tuplesProcessedPSMA += stats.tuplesProcessedPSMA.get();
            // calculate maximum latency for all partitions
            long partitionLatency = calculateLatency(instance);
            if (partitionLatency > info.latencyMA) {
                info.latencyMA = partitionLatency;
            }
            checkpointTimes.addNumber(stats.checkpointTimeMA.getAvg());
        } else {
            info.unifiers.add(instance.getId());
        }

        info.cpuPercentageMA += stats.cpuNanosPMSMA.getAvg() / 10000;

        // earliest heartbeat across all physical operators
        if (stats.lastHeartbeat != null) {
            if (info.lastHeartbeat == 0 || info.lastHeartbeat > stats.lastHeartbeat.getGeneratedTms()) {
                info.lastHeartbeat = stats.lastHeartbeat.getGeneratedTms();
            }
        }

        // lowest current window id across all physical operators
        long instanceWindowId = toWsWindowId(stats.currentWindowId.get());
        if (info.currentWindowId == 0 || info.currentWindowId > instanceWindowId) {
            info.currentWindowId = instanceWindowId;
        }

        // count physical operators per state
        String stateName = instance.getState().toString();
        MutableInt stateCount = info.status.get(stateName);
        if (stateCount == null) {
            stateCount = new MutableInt();
            info.status.put(stateName, stateCount);
        }
        stateCount.increment();

        // lowest recovery window id across all physical operators
        if (instance.getRecoveryCheckpoint() != null) {
            long instanceRecoveryWindowId = toWsWindowId(instance.getRecoveryCheckpoint().windowId);
            if (info.recoveryWindowId == 0 || info.recoveryWindowId > instanceRecoveryWindowId) {
                info.recoveryWindowId = instanceRecoveryWindowId;
            }
        }

        // container id and host are only recorded when the container has an external id
        PTContainer hostContainer = instance.getContainer();
        String externalId = (hostContainer == null) ? null : hostContainer.getExternalId();
        if (externalId != null) {
            info.containerIds.add(externalId);
            info.hosts.add(hostContainer.host);
        }
    }

    if (!instances.isEmpty() && checkpointTimes.getAvg() != null) {
        info.checkpointTimeMA = checkpointTimes.getAvg().longValue();
        info.counters = latestLogicalCounters.get(operator.getName());
        info.autoMetrics = latestLogicalMetrics.get(operator.getName());
    }
    return info;
}
Also used : NumberAggregate(com.datatorrent.common.util.NumberAggregate) LogicalOperatorInfo(com.datatorrent.stram.webapp.LogicalOperatorInfo) PTOperator(com.datatorrent.stram.plan.physical.PTOperator) OperatorStatus(com.datatorrent.stram.plan.physical.OperatorStatus) LogicalOperatorStatus(com.datatorrent.stram.plan.logical.LogicalOperatorStatus) MutableInt(org.apache.commons.lang3.mutable.MutableInt) PTContainer(com.datatorrent.stram.plan.physical.PTContainer)

Aggregations

PTOperator (com.datatorrent.stram.plan.physical.PTOperator)84 Test (org.junit.Test)39 PhysicalPlan (com.datatorrent.stram.plan.physical.PhysicalPlan)38 GenericTestOperator (com.datatorrent.stram.engine.GenericTestOperator)36 PTContainer (com.datatorrent.stram.plan.physical.PTContainer)34 Checkpoint (com.datatorrent.stram.api.Checkpoint)23 LogicalPlan (com.datatorrent.stram.plan.logical.LogicalPlan)22 MemoryStorageAgent (com.datatorrent.stram.support.StramTestSupport.MemoryStorageAgent)16 OperatorDeployInfo (com.datatorrent.stram.api.OperatorDeployInfo)15 OperatorMeta (com.datatorrent.stram.plan.logical.LogicalPlan.OperatorMeta)15 PhysicalPlanTest (com.datatorrent.stram.plan.physical.PhysicalPlanTest)14 TestGeneratorInputOperator (com.datatorrent.stram.engine.TestGeneratorInputOperator)11 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10 AsyncFSStorageAgent (com.datatorrent.common.util.AsyncFSStorageAgent)9 StramTestSupport (com.datatorrent.stram.support.StramTestSupport)9 Map (java.util.Map)9 TestPlanContext (com.datatorrent.stram.plan.TestPlanContext)7 Operator (com.datatorrent.api.Operator)6 StatsListener (com.datatorrent.api.StatsListener)6