use of com.sequenceiq.periscope.monitor.event.UpdateFailedEvent in project cloudbreak by hortonworks.
the class AmbariAgentHealthEvaluator method run.
/**
 * Queries Ambari for the agent heartbeat alert of the cluster and reports every host
 * whose alert state satisfies {@code isAlertStateMet} to Cloudbreak as a failed node.
 * Any exception thrown while querying or reporting is logged and converted into an
 * {@link UpdateFailedEvent} for this cluster.
 */
@Override
public void run() {
    Cluster cluster = clusterService.find(clusterId);
    MDCBuilder.buildMdcContext(cluster);
    LOGGER.info("Checking '{}' alerts.", AMBARI_AGENT_HEARTBEAT);
    try {
        AmbariClient ambariClient = ambariClientProvider.createAmbariClient(cluster);
        List<Map<String, Object>> heartbeatAlerts = ambariClient.getAlert(AMBARI_AGENT_HEARTBEAT_DEF_NAME);

        // Collect the hosts whose heartbeat alert is in a state that needs recovery.
        List<String> failedHosts = new ArrayList<>();
        for (Map<String, Object> entry : heartbeatAlerts) {
            String state = (String) entry.get(ALERT_STATE);
            if (!isAlertStateMet(state)) {
                continue;
            }
            String host = (String) entry.get(HOST_NAME);
            failedHosts.add(host);
            LOGGER.info("Alert: {} is in '{}' state for host '{}'.", AMBARI_AGENT_HEARTBEAT, state, host);
        }

        // Nothing to report when no host matched.
        if (failedHosts.isEmpty()) {
            return;
        }

        for (String host : failedHosts) {
            LOGGER.info("Host to recover: {}", host);
        }
        FailureReport report = new FailureReport();
        report.setFailedNodes(failedHosts);
        CloudbreakClient cloudbreak = cloudbreakClientConfiguration.cloudbreakClient();
        cloudbreak.clusterEndpoint().failureReport(cluster.getStackId(), report);
    } catch (Exception e) {
        LOGGER.warn(String.format("Failed to retrieve '%s' alerts.", AMBARI_AGENT_HEARTBEAT), e);
        publishEvent(new UpdateFailedEvent(clusterId));
    }
}
use of com.sequenceiq.periscope.monitor.event.UpdateFailedEvent in project cloudbreak by hortonworks.
the class MetricEvaluator method run.
/**
 * Evaluates every metric based alert stored for the cluster against its Ambari alert
 * history and publishes a {@link ScalingEvent} for the first alert whose state matches,
 * whose period has been reached and which has a policy attached.
 * Any exception thrown while creating the Ambari client or evaluating the alerts is
 * logged and converted into an {@link UpdateFailedEvent} for this cluster.
 */
@Override
public void run() {
    Cluster cluster = clusterService.find(clusterId);
    MDCBuilder.buildMdcContext(cluster);
    try {
        // Created inside the try block so that a client creation failure is reported
        // through UpdateFailedEvent as well, instead of escaping run() unhandled.
        AmbariClient ambariClient = ambariClientProvider.createAmbariClient(cluster);
        for (MetricAlert alert : alertRepository.findAllByCluster(clusterId)) {
            String alertName = alert.getName();
            LOGGER.info("Checking metric based alert: '{}'", alertName);
            List<Map<String, Object>> alertHistory = ambariClient.getAlertHistory(alert.getDefinitionName(), 1);
            int historySize = alertHistory.size();
            if (historySize > 1) {
                // More than one entry came back for a single definition; such alerts are skipped.
                LOGGER.debug("Multiple results found for alert: {}, probably HOST alert, ignoring now..", alertName);
                continue;
            }
            if (!alertHistory.isEmpty()) {
                Map<String, Object> history = alertHistory.get(0);
                String currentState = (String) history.get(ALERT_STATE);
                if (isAlertStateMet(currentState, alert)) {
                    long elapsedTime = getPeriod(history);
                    LOGGER.info("Alert: {} is in '{}' state since {} min(s)", alertName, currentState,
                            ClusterUtils.TIME_FORMAT.format((double) elapsedTime / ClusterUtils.MIN_IN_MS));
                    if (isPeriodReached(alert, elapsedTime) && isPolicyAttached(alert)) {
                        publishEvent(new ScalingEvent(alert));
                        // Only one scaling event is published per run; remaining alerts are not evaluated.
                        break;
                    }
                }
            }
        }
    } catch (Exception e) {
        LOGGER.error("Failed to retrieve alert history", e);
        publishEvent(new UpdateFailedEvent(clusterId));
    }
}
use of com.sequenceiq.periscope.monitor.event.UpdateFailedEvent in project cloudbreak by hortonworks.
the class PrometheusEvaluator method run.
/**
 * Queries the cluster's Prometheus endpoint for every Prometheus alert stored for the
 * cluster and publishes a {@link ScalingEvent} for each alert that should trigger scaling
 * and has a policy attached.
 * <ul>
 * <li>{@code OK}: scaling is triggered when the query returns no result.</li>
 * <li>{@code CRITICAL}: scaling is triggered when a result in "firing" state has a last
 * sample whose value is a String other than "0".</li>
 * <li>any other state: scaling is never triggered.</li>
 * </ul>
 * Any exception is logged and converted into an {@link UpdateFailedEvent} for this cluster.
 * The REST client created for the run is always closed before returning.
 */
@Override
public void run() {
    // Declared outside the try block so it can be released in finally.
    Client client = null;
    try {
        Cluster cluster = clusterService.find(clusterId);
        MDCBuilder.buildMdcContext(cluster);
        TlsConfiguration tlsConfig = tlsSecurityService.getConfiguration(cluster);
        client = RestClientUtil.createClient(tlsConfig.getServerCert(), tlsConfig.getClientCert(), tlsConfig.getClientKey(), true,
                PrometheusEvaluator.class);
        String prometheusAddress = String.format("https://%s:%s/prometheus", cluster.getAmbari().getHost(), cluster.getPort());
        WebTarget target = client.target(prometheusAddress);
        for (PrometheusAlert alert : alertRepository.findAllByCluster(clusterId)) {
            String alertName = alert.getName();
            LOGGER.info("Checking Prometheus based alert: '{}'", alertName);
            // The query is encoded up front because it contains '{' and '}' characters.
            String query = URLEncoder.encode(String.format("ALERTS{alertname=\"%s\"}[%dm]", alert.getName(), alert.getPeriod()), "UTF-8");
            Response response = target.path("/api/v1/query").queryParam("query", query).request()
                    .header("Accept", MediaType.APPLICATION_JSON_VALUE).get();
            PrometheusResponse prometheusResponse = JaxRSUtil.response(response, PrometheusResponse.class);
            boolean triggerScale = false;
            switch (alert.getAlertState()) {
                case OK:
                    triggerScale = prometheusResponse.getData().getResult().isEmpty();
                    break;
                case CRITICAL:
                    for (Result alertResult : prometheusResponse.getData().getResult()) {
                        if ("firing".equals(alertResult.getMetric().getAlertstate())) {
                            List<Object> lastSample = alertResult.getValues().get(alertResult.getValues().size() - 1);
                            Object alertValue = lastSample.get(1);
                            if (alertValue instanceof String) {
                                if ("0".equals(alertValue)) {
                                    // Leaves only the loop over the results; triggerScale keeps
                                    // whatever an earlier result has set.
                                    break;
                                }
                                triggerScale = true;
                            }
                        }
                    }
                    break;
                default:
                    triggerScale = false;
                    break;
            }
            if (triggerScale && isPolicyAttached(alert)) {
                publishEvent(new ScalingEvent(alert));
            }
        }
    } catch (Exception e) {
        LOGGER.error("Failed to retrieve alerts from Prometheus", e);
        publishEvent(new UpdateFailedEvent(clusterId));
    } finally {
        // A new client is created on every run; release its resources so repeated
        // evaluations do not leak connections.
        if (client != null) {
            client.close();
        }
    }
}
Aggregations