Search in sources:

Example 6 with InstanceMetrics

use of com.microsoft.dhalion.metrics.InstanceMetrics in project incubator-heron by apache.

the class SlowInstanceDiagnoser method diagnose.

/**
 * Decides whether the supplied symptoms point at a slow instance.
 *
 * <p>A slow-instance diagnosis is only considered when a back-pressure symptom is present, at
 * least one component shows wait-queue disparity, and no component shows processing-rate skew.
 * The component reporting back pressure must also be one of the components with wait-queue
 * disparity.
 *
 * @param symptoms the symptoms to evaluate
 * @return a {@code DIAGNOSIS_SLOW_INSTANCE} diagnosis wrapping a {@code SYMPTOM_SLOW_INSTANCE}
 *     symptom, or {@code null} when the symptoms do not indicate a slow instance
 * @throws IllegalStateException if the preconditions above hold but more than one
 *     back-pressure symptom is present
 */
@Override
public Diagnosis diagnose(List<Symptom> symptoms) {
    List<Symptom> backPressure = getBackPressureSymptoms(symptoms);
    Map<String, ComponentMetrics> rateSkew = getProcessingRateSkewComponents(symptoms);
    Map<String, ComponentMetrics> queueDisparity = getWaitQDisparityComponents(symptoms);

    // All three conditions are required: back pressure, wait-queue disparity, and no
    // processing-rate skew. Otherwise no action is needed.
    boolean candidate = !backPressure.isEmpty() && !queueDisparity.isEmpty() && rateSkew.isEmpty();
    if (!candidate) {
        return null;
    }
    if (backPressure.size() > 1) {
        // TODO handle cases where multiple detectors create back pressure symptom
        throw new IllegalStateException("Multiple back-pressure symptoms case");
    }

    ComponentMetrics bpComponent = backPressure.get(0).getComponent();
    // The wait-queue disparity must be reported for the same component that has back pressure.
    ComponentMetrics bufferComponent = queueDisparity.get(bpComponent.getName());
    if (bufferComponent == null) {
        // Back pressure without wait-queue disparity on that component: no slow instance.
        return null;
    }

    ComponentMetrics combined = ComponentMetrics.merge(bpComponent, bufferComponent);
    ComponentMetricsHelper helper = new ComponentMetricsHelper(combined);
    helper.computeBpStats();
    MetricsStats bufferSizeStats = helper.computeMinMaxStats(METRIC_BUFFER_SIZE);

    Symptom slowInstance = null;
    for (InstanceMetrics instance : helper.getBoltsWithBackpressure()) {
        double bufferSize = instance.getMetricValueSum(METRIC_BUFFER_SIZE.text());
        double bpValue = instance.getMetricValueSum(METRIC_BACK_PRESSURE.text());
        // An instance counts as slow when its buffer is more than half of the largest buffer.
        if (bufferSize * 2 > bufferSizeStats.getMetricMax()) {
            LOG.info(String.format("SLOW: %s back-pressure(%s) and high buffer size: %s " + "and similar processing rates", instance.getName(), bpValue, bufferSize));
            slowInstance = new Symptom(SYMPTOM_SLOW_INSTANCE.text(), combined);
        }
    }

    if (slowInstance == null) {
        return null;
    }
    return new Diagnosis(DIAGNOSIS_SLOW_INSTANCE.text(), slowInstance);
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) ComponentMetricsHelper(com.twitter.heron.healthmgr.common.ComponentMetricsHelper) Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) ComponentMetrics(com.microsoft.dhalion.metrics.ComponentMetrics) MetricsStats(com.twitter.heron.healthmgr.common.MetricsStats)

Example 7 with InstanceMetrics

use of com.microsoft.dhalion.metrics.InstanceMetrics in project incubator-heron by apache.

the class BufferSizeSensor method get.

/**
 * Collects the buffer size of every instance of the selected bolts.
 *
 * <p>For each bolt instance a metric named from {@code getMetricName()}, the instance name and
 * {@code MetricName.METRIC_BUFFER_SIZE_SUFFIX} is requested for {@code COMPONENT_STMGR}, and the
 * values reported for it are summed. Instances for which no stream manager result is returned
 * are left out of the component's metrics.
 *
 * @param desiredBoltNames bolts to report on; when none are given, every bolt returned by the
 *     topology provider is included
 * @return map from bolt component name to that component's per-instance buffer size
 */
public Map<String, ComponentMetrics> get(String... desiredBoltNames) {
    Set<String> wantedBolts = new HashSet<>(Arrays.asList(desiredBoltNames));
    Map<String, ComponentMetrics> metricsByBolt = new HashMap<>();

    for (String bolt : topologyProvider.getBoltNames()) {
        // An empty filter means "all bolts".
        boolean selected = wantedBolts.isEmpty() || wantedBolts.contains(bolt);
        if (!selected) {
            continue;
        }

        String[] instanceNames = packingPlanProvider.getBoltInstanceNames(bolt);
        Map<String, InstanceMetrics> perInstance = new HashMap<>();
        for (String instanceName : instanceNames) {
            String metric = getMetricName() + instanceName + MetricName.METRIC_BUFFER_SIZE_SUFFIX;
            ComponentMetrics stmgrMetrics =
                metricsProvider.getComponentMetrics(metric, getDuration(), COMPONENT_STMGR).get(COMPONENT_STMGR);
            if (stmgrMetrics == null) {
                continue;
            }
            Map<String, InstanceMetrics> stmgrInstances = stmgrMetrics.getMetrics();
            if (stmgrInstances.isEmpty()) {
                continue;
            }

            // since a bolt instance belongs to one stream manager, expect just one metrics
            // manager instance in the result
            double bufferSize = 0.0;
            for (InstanceMetrics stmgrInstance : stmgrInstances.values()) {
                Double value = stmgrInstance.getMetricValueSum(metric);
                if (value != null) {
                    bufferSize += value;
                }
            }
            perInstance.put(instanceName, new InstanceMetrics(instanceName, getMetricName(), bufferSize));
        }
        metricsByBolt.put(bolt, new ComponentMetrics(bolt, perInstance));
    }
    return metricsByBolt;
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) HashMap(java.util.HashMap) ComponentMetrics(com.microsoft.dhalion.metrics.ComponentMetrics) HashSet(java.util.HashSet)

Example 8 with InstanceMetrics

use of com.microsoft.dhalion.metrics.InstanceMetrics in project incubator-heron by apache.

the class MetricsCacheMetricsProvider method getComponentMetrics.

/**
 * Fetches one metric for each of the given components from the metrics cache.
 *
 * @param metric name of the metric to fetch
 * @param startTime start of the requested window
 * @param duration length of the requested window
 * @param components names of the components to query
 * @return map from component name to the parsed metrics of that component
 */
@Override
public Map<String, ComponentMetrics> getComponentMetrics(String metric, Instant startTime, Duration duration, String... components) {
    Map<String, ComponentMetrics> metricsByComponent = new HashMap<>();
    for (String name : components) {
        TopologyMaster.MetricResponse rawResponse = getMetricsFromMetricsCache(metric, name, startTime, duration);
        Map<String, InstanceMetrics> parsed = parse(rawResponse, name, metric, startTime);
        metricsByComponent.put(name, new ComponentMetrics(name, parsed));
    }
    return metricsByComponent;
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) HashMap(java.util.HashMap) TopologyMaster(com.twitter.heron.proto.tmaster.TopologyMaster) ComponentMetrics(com.microsoft.dhalion.metrics.ComponentMetrics)

Example 9 with InstanceMetrics

use of com.microsoft.dhalion.metrics.InstanceMetrics in project incubator-heron by apache.

the class RestartContainerResolver method resolve.

/**
 * Restarts the container of the first instance found causing back pressure.
 *
 * <p>Diagnoses without a {@code SYMPTOM_SLOW_INSTANCE} symptom, or whose symptom has no
 * components, are skipped. A diagnosis is also skipped when none of its instances has a
 * back-pressure metric above {@code noiseFilterMillis}, because there is then no container to
 * restart. The method returns after the first restart it issues.
 *
 * @param diagnosis the diagnoses to act on
 * @return a list holding the single {@code ContainerRestart} action that was taken, or an empty
 *     list when no diagnosis led to a restart
 * @throws UnsupportedOperationException if a slow-instance symptom spans more than one component
 */
@Override
public List<Action> resolve(List<Diagnosis> diagnosis) {
    List<Action> actions = new ArrayList<>();
    for (Diagnosis diagnoses : diagnosis) {
        Symptom bpSymptom = diagnoses.getSymptoms().get(SYMPTOM_SLOW_INSTANCE.text());
        if (bpSymptom == null || bpSymptom.getComponents().isEmpty()) {
            // nothing to fix as there is no back pressure
            continue;
        }
        if (bpSymptom.getComponents().size() > 1) {
            throw new UnsupportedOperationException("Multiple components with back pressure symptom");
        }
        // want to know which stmgr has backpressure
        String stmgrId = null;
        for (InstanceMetrics im : bpSymptom.getComponent().getMetrics().values()) {
            if (im.hasMetricAboveLimit(METRIC_BACK_PRESSURE.text(), noiseFilterMillis)) {
                // The container index is taken from between the first and second '_' of the
                // instance id. NOTE(review): presumably ids look like
                // "container_<index>_<component>_<task>" - confirm against the packing plan.
                String instanceId = im.getName();
                int fromIndex = instanceId.indexOf('_') + 1;
                int toIndex = instanceId.indexOf('_', fromIndex);
                stmgrId = instanceId.substring(fromIndex, toIndex);
                break;
            }
        }
        if (stmgrId == null) {
            // No instance exceeded the noise filter. Without this guard the id parse below
            // would throw NumberFormatException on null.
            LOG.info("No instance with back pressure above the noise filter, skipping restart");
            continue;
        }
        LOG.info("Restarting container: " + stmgrId);
        boolean b = schedulerClient.restartTopology(RestartTopologyRequest.newBuilder().setContainerIndex(Integer.parseInt(stmgrId)).setTopologyName(topologyName).build());
        LOG.info("Restarted container result: " + b);
        ContainerRestart action = new ContainerRestart();
        LOG.info("Broadcasting container restart event");
        eventManager.onEvent(action);
        actions.add(action);
        // Only one container is restarted per invocation.
        return actions;
    }
    return actions;
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) Action(com.microsoft.dhalion.resolver.Action) ArrayList(java.util.ArrayList) Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) ContainerRestart(com.twitter.heron.healthmgr.common.HealthManagerEvents.ContainerRestart)

Example 10 with InstanceMetrics

use of com.microsoft.dhalion.metrics.InstanceMetrics in project incubator-heron by apache.

the class BackPressureSensor method get.

/**
 * Reports the average back pressure (millis per second) of every bolt instance.
 *
 * <p>For each bolt instance, the values reported for its back-pressure metric under
 * {@code COMPONENT_STMGR} are summed and divided by the number of seconds in the sensor's
 * duration. The result is capped at 1000. Instances for which no stream manager result is
 * returned are left out of the component's metrics.
 *
 * @return map from bolt component name to that component's per-instance average back pressure
 */
public Map<String, ComponentMetrics> get() {
    Map<String, ComponentMetrics> metricsByBolt = new HashMap<>();

    for (String bolt : topologyProvider.getBoltNames()) {
        String[] instanceNames = packingPlanProvider.getBoltInstanceNames(bolt);
        Duration window = getDuration();
        Map<String, InstanceMetrics> perInstance = new HashMap<>();

        for (String instanceName : instanceNames) {
            String metric = getMetricName() + instanceName;
            ComponentMetrics stmgrMetrics =
                metricsProvider.getComponentMetrics(metric, window, COMPONENT_STMGR).get(COMPONENT_STMGR);
            if (stmgrMetrics == null) {
                continue;
            }
            Map<String, InstanceMetrics> stmgrInstances = stmgrMetrics.getMetrics();
            if (stmgrInstances.isEmpty()) {
                continue;
            }

            // since a bolt instance belongs to one stream manager,
            // for tracker rest api: expect just one metrics manager instance in the result;
            // for tmaster/metricscache stat interface: expect a list
            double totalBp = 0.0;
            for (InstanceMetrics stmgrInstance : stmgrInstances.values()) {
                Double value = stmgrInstance.getMetricValueSum(metric);
                if (value != null) {
                    totalBp += value;
                }
            }

            // The maximum value of the average should be 1000, i.e. 1000 millis of BP per second.
            // Due to a bug in Heron (Issue: 1753), this value could be higher in some cases. The
            // cap below partially corrects the reported BP value.
            double averageBp = Math.min(totalBp / window.getSeconds(), 1000);
            perInstance.put(instanceName, new InstanceMetrics(instanceName, getMetricName(), averageBp));
        }
        metricsByBolt.put(bolt, new ComponentMetrics(bolt, perInstance));
    }
    return metricsByBolt;
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) HashMap(java.util.HashMap) Duration(java.time.Duration) ComponentMetrics(com.microsoft.dhalion.metrics.ComponentMetrics)

Aggregations

InstanceMetrics (com.microsoft.dhalion.metrics.InstanceMetrics) 24, ComponentMetrics (com.microsoft.dhalion.metrics.ComponentMetrics) 15, HashMap (java.util.HashMap) 11, Test (org.junit.Test) 8, Symptom (com.microsoft.dhalion.detector.Symptom) 7, Instant (java.time.Instant) 7, HealthPolicyConfig (com.twitter.heron.healthmgr.HealthPolicyConfig) 4, Diagnosis (com.microsoft.dhalion.diagnoser.Diagnosis) 3, VisibleForTesting (com.google.common.annotations.VisibleForTesting) 2, ComponentMetricsHelper (com.twitter.heron.healthmgr.common.ComponentMetricsHelper) 2, MetricsStats (com.twitter.heron.healthmgr.common.MetricsStats) 2, BufferSizeSensor (com.twitter.heron.healthmgr.sensors.BufferSizeSensor) 2, ExecuteCountSensor (com.twitter.heron.healthmgr.sensors.ExecuteCountSensor) 2, TopologyMaster (com.twitter.heron.proto.tmaster.TopologyMaster) 2, DocumentContext (com.jayway.jsonpath.DocumentContext) 1, Action (com.microsoft.dhalion.resolver.Action) 1, ContainerRestart (com.twitter.heron.healthmgr.common.HealthManagerEvents.ContainerRestart) 1, MetricInterval (com.twitter.heron.proto.tmaster.TopologyMaster.MetricInterval) 1, IndividualMetric (com.twitter.heron.proto.tmaster.TopologyMaster.MetricResponse.IndividualMetric) 1, IntervalValue (com.twitter.heron.proto.tmaster.TopologyMaster.MetricResponse.IndividualMetric.IntervalValue) 1