use of com.microsoft.dhalion.core.MeasurementsTable in project heron by twitter.
the class GrowingWaitQueueDetector method detect.
/**
 * Detects all components unable to keep up with their input load, i.e. components whose
 * pending buffer (wait queue) keeps growing.
 *
 * <p>Only measurements of the wait queue size metric are considered. For every component
 * present in those measurements the value returned by {@code computeWaitQueueSizeTrend} is
 * compared against {@code rateLimit}; a symptom is emitted when the value is strictly greater.
 *
 * @param measurements the measurements to inspect
 * @return a collection of symptoms, each one corresponding to a component executing slower
 * than its input rate
 */
@Override
public Collection<Symptom> detect(Collection<Measurement> measurements) {
  MeasurementsTable queueSizes =
      MeasurementsTable.of(measurements).type(METRIC_WAIT_Q_SIZE.text());
  Collection<Symptom> symptoms = new ArrayList<>();

  for (String name : queueSizes.uniqueComponents()) {
    double trend = computeWaitQueueSizeTrend(queueSizes.component(name));
    // Written as a negated "greater than" so that a NaN trend is skipped as well.
    if (!(trend > rateLimit)) {
      continue;
    }

    LOG.info(String.format("Detected growing wait queues for %s, max rate %f", name, trend));
    // The symptom is assigned to the component itself, not to individual instances.
    Collection<String> assigned = Collections.singletonList(name);
    symptoms.add(new Symptom(SYMPTOM_GROWING_WAIT_Q.text(), context.checkpoint(), assigned));
  }

  return symptoms;
}
use of com.microsoft.dhalion.core.MeasurementsTable in project heron by twitter.
the class DataSkewDiagnoser method diagnose.
/**
 * Produces a data skew diagnosis for the component that reported back pressure.
 *
 * <p>A diagnosis is only attempted when exactly one back-pressure symptom exists and both a
 * wait queue size skew symptom and a processing rate skew symptom are assigned to the same
 * component. An instance is then reported when the component-wide maximum wait queue size is
 * less than twice the instance's mean wait queue size and the component-wide maximum execute
 * count is less than 1.10 times the instance's mean execute count.
 *
 * @param symptoms the symptoms produced by the detectors
 * @return a collection holding at most one data skew diagnosis; empty if the preconditions
 * are not met or no instance qualifies
 * @throws IllegalStateException if more than one back-pressure symptom is present
 */
@Override
public Collection<Diagnosis> diagnose(Collection<Symptom> symptoms) {
  Collection<Diagnosis> result = new ArrayList<>();
  SymptomsTable allSymptoms = SymptomsTable.of(symptoms);
  SymptomsTable backPressure = allSymptoms.type(SYMPTOM_COMP_BACK_PRESSURE.text());

  if (backPressure.size() == 0) {
    return result;
  }
  if (backPressure.size() > 1) {
    // TODO handle cases where multiple detectors create back pressure symptom
    throw new IllegalStateException("Multiple back-pressure symptoms case");
  }

  String component = backPressure.first().assignments().iterator().next();
  SymptomsTable rateSkew = allSymptoms.type(SYMPTOM_PROCESSING_RATE_SKEW.text());
  SymptomsTable queueSkew = allSymptoms.type(SYMPTOM_WAIT_Q_SIZE_SKEW.text());

  // Data skew requires a larger queue size and a processing rate skew for the very same
  // component that is causing back pressure.
  boolean skewConfirmed = queueSkew.assignment(component).size() != 0
      && rateSkew.assignment(component).size() != 0;
  if (!skewConfirmed) {
    return result;
  }

  Collection<String> skewedInstances = new ArrayList<>();
  Instant windowEnd = context.checkpoint();
  Instant windowStart = context.previousCheckpoint();
  MeasurementsTable componentMetrics =
      context.measurements().between(windowStart, windowEnd).component(component);

  for (String instanceName : componentMetrics.uniqueInstances()) {
    MeasurementsTable perInstance = componentMetrics.instance(instanceName);
    double meanQueueSize = perInstance.type(METRIC_WAIT_Q_SIZE.text()).mean();
    double meanExecuteCount = perInstance.type(METRIC_EXE_COUNT.text()).mean();

    // The instance's mean queue size must be more than half of the component-wide maximum.
    if (!(componentMetrics.type(METRIC_WAIT_Q_SIZE.text()).max() < meanQueueSize * 2)) {
      continue;
    }
    // The component-wide maximum execute count must be below 110% of the instance's mean.
    if (!(componentMetrics.type(METRIC_EXE_COUNT.text()).max() < 1.10 * meanExecuteCount)) {
      continue;
    }

    skewedInstances.add(instanceName);
    LOG.info(String.format(
        "DataSkew: %s back-pressure, high execution count: %s and high buffer size %s",
        instanceName, meanExecuteCount, meanQueueSize));
  }

  if (!skewedInstances.isEmpty()) {
    result.add(new Diagnosis(DIAGNOSIS_DATA_SKEW.text(), context.checkpoint(), skewedInstances));
  }
  return result;
}
use of com.microsoft.dhalion.core.MeasurementsTable in project heron by twitter.
the class LargeWaitQueueDetector method detect.
/**
 * Detects all components having a large pending buffer or wait queue.
 *
 * <p>Only measurements of the wait queue size metric are considered. For each component, every
 * instance whose mean wait queue size is strictly greater than {@code sizeLimit} is collected;
 * one symptom per component is emitted when at least one of its instances qualifies.
 *
 * @param measurements the measurements to inspect
 * @return a collection of symptoms, each one corresponding to a component with large wait
 * queues and assigned to the offending instances of that component
 */
@Override
public Collection<Symptom> detect(Collection<Measurement> measurements) {
  Collection<Symptom> result = new ArrayList<>();
  MeasurementsTable waitQueueMetrics =
      MeasurementsTable.of(measurements).type(METRIC_WAIT_Q_SIZE.text());

  for (String component : waitQueueMetrics.uniqueComponents()) {
    Set<String> addresses = new HashSet<>();
    MeasurementsTable instanceMetrics = waitQueueMetrics.component(component);

    for (String instance : instanceMetrics.uniqueInstances()) {
      double avgWaitQSize = instanceMetrics.instance(instance).mean();
      if (avgWaitQSize > sizeLimit) {
        // The logged value is this instance's mean wait queue size; the message previously
        // lacked a space before the instance name and mislabelled the value.
        LOG.info(String.format(
            "Detected large wait queues for instance %s, average queue size is %f",
            instance, avgWaitQSize));
        addresses.add(instance);
      }
    }

    if (!addresses.isEmpty()) {
      result.add(new Symptom(SYMPTOM_LARGE_WAIT_Q.text(), context.checkpoint(), addresses));
    }
  }
  return result;
}
use of com.microsoft.dhalion.core.MeasurementsTable in project heron by twitter.
the class SlowInstanceDiagnoser method diagnose.
/**
 * Produces a slow instance diagnosis for the component that reported back pressure.
 *
 * <p>A diagnosis is only attempted when exactly one back-pressure symptom exists, a wait queue
 * size skew symptom is assigned to the same component and no processing rate skew symptom is
 * assigned to it. An instance is then reported when the component-wide maximum wait queue size
 * is less than twice the instance's mean wait queue size.
 *
 * @param symptoms the symptoms produced by the detectors
 * @return a collection holding at most one slow instance diagnosis; empty if the preconditions
 * are not met or no instance qualifies
 * @throws IllegalStateException if more than one back-pressure symptom is present
 */
@Override
public Collection<Diagnosis> diagnose(Collection<Symptom> symptoms) {
  publishingMetrics.executeDiagnoserIncr(SLOW_INSTANCE_DIAGNOSER);

  Collection<Diagnosis> result = new ArrayList<>();
  SymptomsTable allSymptoms = SymptomsTable.of(symptoms);
  SymptomsTable backPressure = allSymptoms.type(SYMPTOM_COMP_BACK_PRESSURE.text());

  if (backPressure.size() == 0) {
    return result;
  }
  if (backPressure.size() > 1) {
    // TODO handle cases where multiple detectors create back pressure symptom
    throw new IllegalStateException("Multiple back-pressure symptoms case");
  }

  String component = backPressure.first().assignments().iterator().next();
  SymptomsTable rateSkew = allSymptoms.type(SYMPTOM_PROCESSING_RATE_SKEW.text());
  SymptomsTable queueSkew = allSymptoms.type(SYMPTOM_WAIT_Q_SIZE_SKEW.text());

  // A slow instance shows up as a wait queue size skew on the back-pressure component ...
  if (queueSkew.assignment(component).size() == 0) {
    return result;
  }
  // ... while no processing rate skew is reported for that same component.
  if (rateSkew.assignment(component).size() > 0) {
    // TODO in a short window rate skew could exist
    return result;
  }

  Collection<String> slowInstances = new ArrayList<>();
  Instant windowEnd = context.checkpoint();
  Instant windowStart = context.previousCheckpoint();
  MeasurementsTable componentMetrics =
      context.measurements().between(windowStart, windowEnd).component(component);

  for (String instanceName : componentMetrics.uniqueInstances()) {
    double meanQueueSize =
        componentMetrics.instance(instanceName).type(METRIC_WAIT_Q_SIZE.text()).mean();

    // The instance's mean queue size must be more than half of the component-wide maximum.
    if (!(componentMetrics.type(METRIC_WAIT_Q_SIZE.text()).max() < meanQueueSize * 2)) {
      continue;
    }

    slowInstances.add(instanceName);
    LOG.info(String.format(
        "SLOW: %s back-pressure and high buffer size: %s and similar processing rates",
        instanceName, meanQueueSize));
  }

  if (!slowInstances.isEmpty()) {
    result.add(
        new Diagnosis(DIAGNOSIS_SLOW_INSTANCE.text(), context.checkpoint(), slowInstances));
  }
  return result;
}
use of com.microsoft.dhalion.core.MeasurementsTable in project heron by twitter.
the class BufferSizeSensor method fetch.
/**
 * Fetches the buffer (wait queue) size of every bolt instance.
 *
 * <p>For each instance of each bolt component, the metric named
 * {@code getMetricName() + instance + METRIC_WAIT_Q_SIZE_SUFFIX} is requested from the metrics
 * provider for the stream manager component, ending at the current checkpoint and covering
 * {@code getDuration()}. The returned values are summed into a single measurement per instance.
 * Instances for which no stream manager measurements are available are skipped.
 *
 * @return buffer size measurements, at most one per bolt instance
 */
@Override
public Collection<Measurement> fetch() {
  Collection<Measurement> bufferSizes = new ArrayList<>();
  Instant checkpoint = context.checkpoint();
  List<String> bolts = physicalPlanProvider.getBoltNames();
  Duration window = getDuration();

  for (String bolt : bolts) {
    for (String instanceName : packingPlanProvider.getBoltInstanceNames(bolt)) {
      String metricName = getMetricName() + instanceName + MetricName.METRIC_WAIT_Q_SIZE_SUFFIX;
      Collection<Measurement> fromStmgr =
          metricsProvider.getMeasurements(checkpoint, window, metricName, COMPONENT_STMGR);

      if (!fromStmgr.isEmpty()) {
        MeasurementsTable stmgrTable =
            MeasurementsTable.of(fromStmgr).component(COMPONENT_STMGR);

        if (stmgrTable.size() != 0) {
          // Collapse all values reported for this instance into one total.
          double total = stmgrTable.type(metricName).sum();
          bufferSizes.add(
              new Measurement(bolt, instanceName, getMetricName(), checkpoint, total));
        }
      }
    }
  }
  return bufferSizes;
}
Aggregations