Search in sources :

Example 6 with Diagnosis

use of com.microsoft.dhalion.diagnoser.Diagnosis in project incubator-heron by apache.

the class DataSkewDiagnoserTest method diagnosisNoDataSkewLowRate.

/**
 * Feeds {@link DataSkewDiagnoser} a back-pressure symptom list together with an
 * execution-count symptom and a wait-queue-disparity symptom (all built through
 * {@code TestUtils}) and expects that no diagnosis is produced for these values.
 */
@Test
public void diagnosisNoDataSkewLowRate() {
    // Arrange: back pressure plus the two additional symptoms.
    List<Symptom> input = TestUtils.createBpSymptomList(123, 0, 0);
    input.add(TestUtils.createExeCountSymptom(100, 2000, 2000));
    input.add(TestUtils.createWaitQueueDisparitySymptom(10000, 500, 500));

    // Act + assert: the diagnoser must return null for this symptom set.
    DataSkewDiagnoser diagnoser = new DataSkewDiagnoser();
    assertNull(diagnoser.diagnose(input));
}
Also used : Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) Test(org.junit.Test)

Example 7 with Diagnosis

use of com.microsoft.dhalion.diagnoser.Diagnosis in project incubator-heron by apache.

the class DataSkewDiagnoserTest method diagnosis1DataSkewInstance.

/**
 * Feeds {@link DataSkewDiagnoser} a back-pressure symptom list together with an
 * execution-count symptom and a wait-queue-disparity symptom, and expects a
 * data-skew diagnosis that carries exactly one symptom whose component reports
 * a back-pressure sum of 123 for instance {@code container_1_bolt_0}.
 */
@Test
public void diagnosis1DataSkewInstance() {
    // Arrange
    List<Symptom> input = TestUtils.createBpSymptomList(123, 0, 0);
    input.add(TestUtils.createExeCountSymptom(5000, 2000, 2000));
    input.add(TestUtils.createWaitQueueDisparitySymptom(10000, 500, 500));

    // Act
    Diagnosis diagnosis = new DataSkewDiagnoser().diagnose(input);

    // Assert: a data-skew diagnosis with a single symptom is returned.
    assertNotNull(diagnosis);
    assertEquals(DIAGNOSIS_DATA_SKEW.text(), diagnosis.getName());
    assertEquals(1, diagnosis.getSymptoms().size());

    // The only symptom must still expose the back-pressure value passed in above.
    Symptom onlySymptom = diagnosis.getSymptoms().values().iterator().next();
    int backPressure = onlySymptom.getComponent()
        .getMetricValueSum("container_1_bolt_0", METRIC_BACK_PRESSURE.text())
        .intValue();
    assertEquals(123, backPressure);
}
Also used : Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) Test(org.junit.Test)

Example 8 with Diagnosis

use of com.microsoft.dhalion.diagnoser.Diagnosis in project incubator-heron by apache.

the class UnderProvisioningDiagnoserTest method diagnosisFailsNotSimilarProcessingRates.

/**
 * Feeds {@link UnderProvisioningDiagnoser} a back-pressure symptom list together
 * with an execution-count symptom and expects that no diagnosis is produced for
 * these values.
 */
@Test
public void diagnosisFailsNotSimilarProcessingRates() {
    // Arrange: back pressure plus an execution-count symptom.
    List<Symptom> input = TestUtils.createBpSymptomList(123, 0, 0);
    input.add(TestUtils.createExeCountSymptom(100, 500, 500));

    // Act + assert: the diagnoser must return null for this symptom set.
    UnderProvisioningDiagnoser diagnoser = new UnderProvisioningDiagnoser();
    assertNull(diagnoser.diagnose(input));
}
Also used : Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) Test(org.junit.Test)

Example 9 with Diagnosis

use of com.microsoft.dhalion.diagnoser.Diagnosis in project incubator-heron by apache.

the class SlowInstanceDiagnoser method diagnose.

/**
 * Produces a slow-instance diagnosis from the given symptoms.
 *
 * <p>A diagnosis is only considered when there is a back-pressure symptom, at least one
 * component with wait-queue disparity, and no component with processing-rate skew. The
 * back-pressure component must also appear among the wait-queue-disparity components.
 * The metrics of both are merged, and every bolt reported as having back pressure is
 * checked: if the maximum buffer size across the component is less than twice that
 * bolt's buffer size, a slow-instance symptom is created over the merged metrics.
 *
 * @param symptoms symptoms to inspect
 * @return a diagnosis named {@code DIAGNOSIS_SLOW_INSTANCE} wrapping the slow-instance
 *     symptom, or {@code null} when the preconditions do not hold or no bolt qualifies
 * @throws IllegalStateException if more than one back-pressure symptom is present
 */
@Override
public Diagnosis diagnose(List<Symptom> symptoms) {
    List<Symptom> backPressure = getBackPressureSymptoms(symptoms);
    Map<String, ComponentMetrics> rateSkew = getProcessingRateSkewComponents(symptoms);
    Map<String, ComponentMetrics> queueDisparity = getWaitQDisparityComponents(symptoms);

    // Requires back pressure and wait-queue disparity, and no processing-rate skew;
    // otherwise no action is needed.
    boolean applicable = !backPressure.isEmpty() && !queueDisparity.isEmpty() && rateSkew.isEmpty();
    if (!applicable) {
        return null;
    }
    if (backPressure.size() > 1) {
        // TODO handle cases where multiple detectors create back pressure symptom
        throw new IllegalStateException("Multiple back-pressure symptoms case");
    }

    ComponentMetrics bpComponent = backPressure.get(0).getComponent();

    // The component under back pressure must also show wait-queue disparity.
    ComponentMetrics queueComponent = queueDisparity.get(bpComponent.getName());
    if (queueComponent == null) {
        // No wait-queue disparity for the back-pressure component: no slow instance.
        return null;
    }

    ComponentMetrics merged = ComponentMetrics.merge(bpComponent, queueComponent);
    ComponentMetricsHelper helper = new ComponentMetricsHelper(merged);
    helper.computeBpStats();
    MetricsStats bufferStats = helper.computeMinMaxStats(METRIC_BUFFER_SIZE);

    Symptom slowInstance = null;
    for (InstanceMetrics bolt : helper.getBoltsWithBackpressure()) {
        double bufferSize = bolt.getMetricValueSum(METRIC_BUFFER_SIZE.text());
        double bpValue = bolt.getMetricValueSum(METRIC_BACK_PRESSURE.text());

        // Kept as a positive "<" test so the comparison behaves exactly as written.
        boolean maxBelowTwiceThisBuffer = bufferStats.getMetricMax() < bufferSize * 2;
        if (!maxBelowTwiceThisBuffer) {
            continue;
        }

        LOG.info(String.format("SLOW: %s back-pressure(%s) and high buffer size: %s " + "and similar processing rates", bolt.getName(), bpValue, bufferSize));
        slowInstance = new Symptom(SYMPTOM_SLOW_INSTANCE.text(), merged);
    }

    if (slowInstance == null) {
        return null;
    }
    return new Diagnosis(DIAGNOSIS_SLOW_INSTANCE.text(), slowInstance);
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) ComponentMetricsHelper(com.twitter.heron.healthmgr.common.ComponentMetricsHelper) Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) ComponentMetrics(com.microsoft.dhalion.metrics.ComponentMetrics) MetricsStats(com.twitter.heron.healthmgr.common.MetricsStats)

Example 10 with Diagnosis

use of com.microsoft.dhalion.diagnoser.Diagnosis in project incubator-heron by apache.

the class RestartContainerResolver method resolve.

/**
 * Restarts the container of the first instance whose back-pressure metric exceeds
 * {@code noiseFilterMillis}, for the first diagnosis that carries a slow-instance symptom.
 *
 * <p>Diagnoses without a slow-instance symptom, or whose symptom has no components, are
 * skipped. A diagnosis whose symptom has no instance above the noise filter is skipped as
 * well, because there is no container id to restart. After the first restart request has
 * been issued the method returns immediately; remaining diagnoses are not processed.
 *
 * @param diagnosis diagnoses to act upon
 * @return a list holding the single {@code ContainerRestart} action that was issued, or an
 *     empty list when no diagnosis led to a restart
 * @throws UnsupportedOperationException if the slow-instance symptom spans more than one
 *     component
 */
@Override
public List<Action> resolve(List<Diagnosis> diagnosis) {
    List<Action> actions = new ArrayList<>();
    for (Diagnosis diagnoses : diagnosis) {
        Symptom bpSymptom = diagnoses.getSymptoms().get(SYMPTOM_SLOW_INSTANCE.text());
        if (bpSymptom == null || bpSymptom.getComponents().isEmpty()) {
            // nothing to fix as there is no back pressure
            continue;
        }
        if (bpSymptom.getComponents().size() > 1) {
            throw new UnsupportedOperationException("Multiple components with back pressure symptom");
        }

        // want to know which stmgr has backpressure
        String stmgrId = null;
        for (InstanceMetrics im : bpSymptom.getComponent().getMetrics().values()) {
            if (im.hasMetricAboveLimit(METRIC_BACK_PRESSURE.text(), noiseFilterMillis)) {
                // The container index is the token between the first and second underscore
                // of the instance name.
                String instanceId = im.getName();
                int fromIndex = instanceId.indexOf('_') + 1;
                int toIndex = instanceId.indexOf('_', fromIndex);
                stmgrId = instanceId.substring(fromIndex, toIndex);
                break;
            }
        }

        if (stmgrId == null) {
            // No instance is above the noise filter, so there is no container to restart.
            // Previously this fell through and failed while parsing a null container id.
            LOG.info("No instance with back pressure above the noise filter; skipping container restart");
            continue;
        }

        LOG.info("Restarting container: " + stmgrId);
        boolean b = schedulerClient.restartTopology(RestartTopologyRequest.newBuilder().setContainerIndex(Integer.parseInt(stmgrId)).setTopologyName(topologyName).build());
        LOG.info("Restarted container result: " + b);

        ContainerRestart action = new ContainerRestart();
        LOG.info("Broadcasting container restart event");
        eventManager.onEvent(action);
        actions.add(action);

        // Only one container restart is issued per invocation.
        return actions;
    }
    return actions;
}
Also used : InstanceMetrics(com.microsoft.dhalion.metrics.InstanceMetrics) Action(com.microsoft.dhalion.resolver.Action) ArrayList(java.util.ArrayList) Diagnosis(com.microsoft.dhalion.diagnoser.Diagnosis) Symptom(com.microsoft.dhalion.detector.Symptom) ContainerRestart(com.twitter.heron.healthmgr.common.HealthManagerEvents.ContainerRestart)

Aggregations

Diagnosis (com.microsoft.dhalion.diagnoser.Diagnosis)16 Symptom (com.microsoft.dhalion.detector.Symptom)15 Test (org.junit.Test)11 ComponentMetrics (com.microsoft.dhalion.metrics.ComponentMetrics)6 InstanceMetrics (com.microsoft.dhalion.metrics.InstanceMetrics)3 Action (com.microsoft.dhalion.resolver.Action)3 ComponentMetricsHelper (com.twitter.heron.healthmgr.common.ComponentMetricsHelper)3 ArrayList (java.util.ArrayList)3 MetricsStats (com.twitter.heron.healthmgr.common.MetricsStats)2 PackingPlan (com.twitter.heron.spi.packing.PackingPlan)2 HashMap (java.util.HashMap)2 TopologyAPI (com.twitter.heron.api.generated.TopologyAPI)1 ContainerRestart (com.twitter.heron.healthmgr.common.HealthManagerEvents.ContainerRestart)1 TopologyUpdate (com.twitter.heron.healthmgr.common.HealthManagerEvents.TopologyUpdate)1 PackingPlanProvider (com.twitter.heron.healthmgr.common.PackingPlanProvider)1 Scheduler (com.twitter.heron.proto.scheduler.Scheduler)1 UpdateTopologyRequest (com.twitter.heron.proto.scheduler.Scheduler.UpdateTopologyRequest)1 ISchedulerClient (com.twitter.heron.scheduler.client.ISchedulerClient)1 Config (com.twitter.heron.spi.common.Config)1