use of com.microsoft.dhalion.detector.Symptom in project incubator-heron by apache.
the class LargeWaitQueueDetector method detect.
/**
 * Inspects the pending-buffer metrics reported for every component and flags each component
 * whose smallest reported buffer size is still above the configured size limit.
 *
 * @return one {@code SYMPTOM_LARGE_WAIT_Q} symptom per flagged component; an empty list when no
 *     component exceeds the limit
 */
@Override
public List<Symptom> detect() {
  List<Symptom> symptoms = new ArrayList<>();
  Map<String, ComponentMetrics> metricsByComponent = pendingBufferSensor.get();

  for (ComponentMetrics component : metricsByComponent.values()) {
    MetricsStats bufferStats =
        new ComponentMetricsHelper(component).computeMinMaxStats(METRIC_BUFFER_SIZE.text());

    // The minimum is compared, so a component is flagged only when even its smallest
    // reported buffer is over the limit.
    if (bufferStats.getMetricMin() > sizeLimit) {
      String message = String.format(
          "Detected large wait queues for %s, smallest queue is %f",
          component.getName(),
          bufferStats.getMetricMin());
      LOG.info(message);
      symptoms.add(new Symptom(SYMPTOM_LARGE_WAIT_Q.text(), component));
    }
  }

  return symptoms;
}
use of com.microsoft.dhalion.detector.Symptom in project incubator-heron by apache.
the class BackPressureDetector method detect.
/**
 * Inspects the back-pressure metrics reported for every component and flags each component
 * whose total back pressure is above the configured noise filter threshold.
 *
 * @return one {@code SYMPTOM_BACK_PRESSURE} symptom per flagged component; an empty list when
 *     no component exceeds the threshold
 */
@Override
public List<Symptom> detect() {
  List<Symptom> symptoms = new ArrayList<>();
  Map<String, ComponentMetrics> metricsByComponent = bpSensor.get();

  for (ComponentMetrics component : metricsByComponent.values()) {
    ComponentMetricsHelper helper = new ComponentMetricsHelper(component);
    // Back-pressure statistics must be computed before the total is read.
    helper.computeBpStats();

    if (helper.getTotalBackpressure() > noiseFilterMillis) {
      String message = String.format(
          "Detected back pressure for %s, total back pressure is %f",
          component.getName(),
          helper.getTotalBackpressure());
      LOG.info(message);
      symptoms.add(new Symptom(SYMPTOM_BACK_PRESSURE.text(), component));
    }
  }

  return symptoms;
}
use of com.microsoft.dhalion.detector.Symptom in project incubator-heron by apache.
the class DataSkewDiagnoser method diagnose.
/**
 * Diagnoses data skew: the single component showing back pressure must also appear among the
 * components with processing-rate skew and among those with wait-queue disparity, and at least
 * one of its back-pressured instances must sit close to the component-wide maxima.
 *
 * @param symptoms symptoms produced by the detectors
 * @return a {@code DIAGNOSIS_DATA_SKEW} diagnosis wrapping a {@code SYMPTOM_DATA_SKEW} symptom,
 *     or {@code null} when the symptoms do not indicate data skew
 * @throws IllegalStateException if more than one back-pressure symptom is present
 */
@Override
public Diagnosis diagnose(List<Symptom> symptoms) {
  List<Symptom> backPressure = getBackPressureSymptoms(symptoms);
  Map<String, ComponentMetrics> rateSkewed = getProcessingRateSkewComponents(symptoms);
  Map<String, ComponentMetrics> queueSkewed = getWaitQDisparityComponents(symptoms);

  if (backPressure.isEmpty() || rateSkewed.isEmpty() || queueSkewed.isEmpty()) {
    // Without back pressure, processing-rate skew and wait-queue disparity all being
    // present there is nothing to diagnose.
    return null;
  }
  if (backPressure.size() > 1) {
    // TODO handle cases where multiple detectors create back pressure symptom
    throw new IllegalStateException("Multiple back-pressure symptoms case");
  }

  ComponentMetrics bpComponent = backPressure.get(0).getComponent();
  String componentName = bpComponent.getName();

  // All three signals have to be reported for the very same component.
  ComponentMetrics exeCountMetrics = rateSkewed.get(componentName);
  ComponentMetrics bufferMetrics = queueSkewed.get(componentName);
  if (exeCountMetrics == null || bufferMetrics == null) {
    // No processing-rate skew or no wait-queue disparity for the component with back
    // pressure. This is not a data skew case.
    return null;
  }

  ComponentMetrics merged =
      ComponentMetrics.merge(bpComponent, ComponentMetrics.merge(exeCountMetrics, bufferMetrics));
  ComponentMetricsHelper helper = new ComponentMetricsHelper(merged);
  helper.computeBpStats();
  MetricsStats exeStats = helper.computeMinMaxStats(METRIC_EXE_COUNT);
  MetricsStats bufferStats = helper.computeMinMaxStats(METRIC_BUFFER_SIZE);

  Symptom skewSymptom = null;
  for (InstanceMetrics instance : helper.getBoltsWithBackpressure()) {
    double exeCount = instance.getMetricValueSum(METRIC_EXE_COUNT.text());
    double bufferSize = instance.getMetricValueSum(METRIC_BUFFER_SIZE.text());
    double bpValue = instance.getMetricValueSum(METRIC_BACK_PRESSURE.text());

    // The instance qualifies when the component-wide max execute count is below 1.10x its
    // own execute count and the component-wide max buffer size is below 2x its own.
    boolean nearMaxExeCount = exeStats.getMetricMax() < 1.10 * exeCount;
    if (nearMaxExeCount && bufferStats.getMetricMax() < 2 * bufferSize) {
      String message = String.format(
          "DataSkew: %s back-pressure(%s), high execution count: %s and high buffer size %s",
          instance.getName(), bpValue, exeCount, bufferSize);
      LOG.info(message);
      skewSymptom = new Symptom(SYMPTOM_DATA_SKEW.text(), merged);
    }
  }

  if (skewSymptom == null) {
    return null;
  }
  return new Diagnosis(DIAGNOSIS_DATA_SKEW.text(), skewSymptom);
}
use of com.microsoft.dhalion.detector.Symptom in project incubator-heron by apache.
the class UnderProvisioningDiagnoser method diagnose.
/**
 * Diagnoses under-provisioning: exactly one back-pressure symptom is present while no component
 * shows processing-rate skew or wait-queue disparity.
 *
 * @param symptoms symptoms produced by the detectors
 * @return a {@code DIAGNOSIS_UNDER_PROVISIONING} diagnosis wrapping a
 *     {@code SYMPTOM_UNDER_PROVISIONING} symptom for the back-pressured component, or
 *     {@code null} when the symptoms do not match this case
 * @throws IllegalStateException if more than one back-pressure symptom is present
 */
@Override
public Diagnosis diagnose(List<Symptom> symptoms) {
  List<Symptom> backPressure = getBackPressureSymptoms(symptoms);
  Map<String, ComponentMetrics> rateSkewed = getProcessingRateSkewComponents(symptoms);
  Map<String, ComponentMetrics> queueSkewed = getWaitQDisparityComponents(symptoms);

  if (backPressure.isEmpty() || !(rateSkewed.isEmpty() && queueSkewed.isEmpty())) {
    // Either there is no back pressure, or some component shows disparate processing
    // rates or buffer sizes; this diagnoser takes no action.
    return null;
  }
  if (backPressure.size() > 1) {
    // TODO handle cases where multiple detectors create back pressure symptom
    throw new IllegalStateException("Multiple back-pressure symptoms case");
  }

  ComponentMetrics bpComponent = backPressure.get(0).getComponent();
  ComponentMetricsHelper helper = new ComponentMetricsHelper(bpComponent);
  helper.computeBpStats();

  String message = String.format(
      "UNDER_PROVISIONING: %s back-pressure(%s) and similar processing rates and buffer sizes",
      bpComponent.getName(),
      helper.getTotalBackpressure());
  LOG.info(message);

  Symptom underProvisioned = new Symptom(SYMPTOM_UNDER_PROVISIONING.text(), bpComponent);
  return new Diagnosis(DIAGNOSIS_UNDER_PROVISIONING.text(), underProvisioned);
}
use of com.microsoft.dhalion.detector.Symptom in project incubator-heron by apache.
the class ScaleUpResolver method resolve.
/**
 * Acts on the first diagnosis that carries an under-provisioning symptom with a component:
 * computes a new parallelism for that component, builds a new packing plan, asks the scheduler
 * to update the topology and broadcasts a {@code TopologyUpdate} event.
 *
 * @param diagnosis diagnoses to inspect, in order
 * @return a list holding the single {@code TopologyUpdate} action that was broadcast, or
 *     {@code null} when no diagnosis qualifies or no new packing plan could be built
 * @throws UnsupportedOperationException if the symptom refers to more than one component
 * @throws RuntimeException if the scheduler reports that the topology update failed
 */
@Override
public List<Action> resolve(List<Diagnosis> diagnosis) {
  for (Diagnosis current : diagnosis) {
    Symptom underProvisioned = current.getSymptoms().get(SYMPTOM_UNDER_PROVISIONING.text());
    boolean nothingToFix = underProvisioned == null || underProvisioned.getComponents().isEmpty();
    if (nothingToFix) {
      // This diagnosis carries no under-provisioned component; try the next one.
      continue;
    }
    if (underProvisioned.getComponents().size() > 1) {
      throw new UnsupportedOperationException("Multiple components with back pressure symptom");
    }

    ComponentMetrics target = underProvisioned.getComponent();
    int parallelism = computeScaleUpFactor(target);

    Map<String, Integer> parallelismChanges = new HashMap<>();
    parallelismChanges.put(target.getName(), parallelism);

    PackingPlan currentPlan = packingPlanProvider.get();
    PackingPlan proposedPlan = buildNewPackingPlan(parallelismChanges, currentPlan);
    if (proposedPlan == null) {
      return null;
    }

    Scheduler.UpdateTopologyRequest request =
        Scheduler.UpdateTopologyRequest.newBuilder()
            .setCurrentPackingPlan(getSerializedPlan(currentPlan))
            .setProposedPackingPlan(getSerializedPlan(proposedPlan))
            .build();
    LOG.info("Sending Updating topology request: " + request);

    boolean updated = schedulerClient.updateTopology(request);
    if (!updated) {
      throw new RuntimeException(String.format(
          "Failed to update topology with Scheduler, updateTopologyRequest=%s", request));
    }

    // The scheduler accepted the update; notify listeners before reporting success.
    TopologyUpdate update = new TopologyUpdate();
    LOG.info("Broadcasting topology update event");
    eventManager.onEvent(update);
    LOG.info("Scheduler updated topology successfully.");

    List<Action> performed = new ArrayList<>();
    performed.add(update);
    return performed;
  }

  return null;
}
Aggregations