Use of com.alibaba.otter.shared.communication.model.arbitrate.NodeAlarmEvent in project otter by alibaba.
Class LogRecordServiceImpl, method create.
/**
 * Builds a {@link LogRecord} from the given event and hands it to {@code create(LogRecord)}.
 *
 * <p>Only {@link NodeAlarmEvent} instances contribute data (pipeline id, node id, title and
 * message). For any other event the record is passed on exactly as it was constructed.
 *
 * @param event the event to turn into a log record
 */
public void create(Event event) {
    LogRecord record = new LogRecord();
    if (event instanceof NodeAlarmEvent) {
        NodeAlarmEvent alarm = (NodeAlarmEvent) event;
        record.setTitle(alarm.getTitle());
        record.setMessage(alarm.getMessage());
        record.setNid(alarm.getNid());

        // The record references its pipeline through a Pipeline object; only the id is set here.
        Pipeline pipelineRef = new Pipeline();
        pipelineRef.setId(alarm.getPipelineId());
        record.setPipeline(pipelineRef);
    }
    create(record);
}
Use of com.alibaba.otter.shared.communication.model.arbitrate.NodeAlarmEvent in project otter by alibaba.
Class ExceptionRuleMonitor, method feed.
/**
 * Logs an incoming node alarm and evaluates it against the pipeline's enabled exception rules.
 *
 * <p>Input that is not a {@link NodeAlarmEvent} is ignored. For an alarm event, a log record is
 * always written first; afterwards every enabled rule whose monitor name is
 * {@code MonitorName.EXCEPTION} and that passes {@code checkEnable} is handed to {@code check}.
 *
 * @param data       candidate alarm payload; only {@link NodeAlarmEvent} instances are processed
 * @param pipelineId pipeline under which the log record is filed and whose rules are looked up
 */
@Override
public void feed(Object data, Long pipelineId) {
    if (!(data instanceof NodeAlarmEvent)) {
        return;
    }
    NodeAlarmEvent event = (NodeAlarmEvent) data;

    // An exception must always be logged, whether or not any rule ends up matching.
    String logText = String.format(MESAGE_FORMAT,
                                   event.getPipelineId(),
                                   event.getNid(),
                                   event.getMessage());
    logRecordAlarm(pipelineId, event.getNid(), MonitorName.EXCEPTION, logText);

    // Alarm check.
    // TODO: ask alarmRuleService to provide this filtering itself.
    List<AlarmRule> enabledRules = alarmRuleService.getAlarmRules(pipelineId, AlarmRuleStatus.ENABLE);
    Date now = new Date();
    List<AlarmRule> applicable = new ArrayList<AlarmRule>();
    for (AlarmRule candidate : enabledRules) {
        if (!MonitorName.EXCEPTION.equals(candidate.getMonitorName())) {
            continue;
        }
        if (checkEnable(candidate, now)) {
            applicable.add(candidate);
        }
    }

    // Nothing happens here when no rule qualified.
    for (AlarmRule matched : applicable) {
        check(matched, event);
    }
}
Use of com.alibaba.otter.shared.communication.model.arbitrate.NodeAlarmEvent in project otter by alibaba.
Class RestartAlarmRecovery, method processRecovery.
/**
 * Recovers a channel, either by a restart or by a stop followed by a start, and on success
 * feeds a notification alarm into the exception rule monitor.
 *
 * @param channelId id of the channel to recover
 * @param ruleId    id of the alarm rule, used only in the notification message
 * @param needStop  {@code true} to stop and then start the channel; {@code false} to restart it
 *                  through the arbitrate service
 * @return the restart outcome when {@code needStop} is {@code false}; always {@code true} when
 *         {@code needStop} is {@code true} and the stop/start calls return normally
 */
private boolean processRecovery(Long channelId, Long ruleId, boolean needStop) {
    final boolean recovered;
    if (needStop) {
        // Works around incomplete resource release in process RPC mode: going through stop
        // releases all object resources completely once.
        channelService.stopChannel(channelId);
        channelService.startChannel(channelId);
        recovered = true;
    } else {
        recovered = arbitrateManageService.channelEvent().restart(channelId);
        if (recovered) {
            // Push the configuration.
            channelService.notifyChannel(channelId);
        }
    }

    NodeAlarmEvent alarm = new NodeAlarmEvent();
    alarm.setPipelineId(-1L);
    alarm.setTitle(MonitorName.EXCEPTION.name());
    if (!recovered) {
        return false;
    }

    String template = needStop
        ? "cid:%s stop recovery successful for rid:%s"
        : "cid:%s restart recovery successful for rid:%s";
    alarm.setMessage(String.format(template, String.valueOf(channelId), String.valueOf(ruleId)));
    try {
        exceptionRuleMonitor.feed(alarm, alarm.getPipelineId());
    } catch (Exception e) {
        // A failed notification is logged only; it does not change the returned result.
        logger.error(String.format("ERROR # exceptionRuleMonitor error for %s", alarm.toString()), e);
    }
    return true;
}
Use of com.alibaba.otter.shared.communication.model.arbitrate.NodeAlarmEvent in project otter by alibaba.
Class ExceptionRuleMonitorTest, method testSerialProcess.
/**
 * Feeds a single exception alarm into the monitor while {@code alarmRuleService} is stubbed
 * to return one enabled EXCEPTION rule.
 *
 * <p>The test contains no assertions or verifications; as written it only requires that
 * {@code monitor.feed} completes without throwing.
 */
@Test
public void testSerialProcess() {
new NonStrictExpectations() {
{
// Recorded call: a getAlarmRules lookup with any pipeline id and status ENABLE.
alarmRuleService.getAlarmRules(anyLong, AlarmRuleStatus.ENABLE);
// Canned result: a single enabled EXCEPTION rule for pipeline 2.
List<AlarmRule> rules = new ArrayList<AlarmRule>();
AlarmRule rule = new AlarmRule();
rule.setDescription("xxx");
rule.setGmtCreate(new Date());
rule.setGmtModified(new Date());
rule.setId(1L);
rule.setMatchValue("EXCEPTION");
rule.setMonitorName(MonitorName.EXCEPTION);
rule.setPipelineId(2L);
rule.setReceiverKey("otterteam");
rule.setStatus(AlarmRuleStatus.ENABLE);
rules.add(rule);
// Must stay after the recorded call above: supplies the value that call returns.
returns(rules);
}
};
// Alarm event for pipeline 2 / node 5.
// NOTE(review): message and title spell "EXCEPTON" while the rule's match value is
// "EXCEPTION" - confirm whether the mismatch is intentional.
NodeAlarmEvent event = new NodeAlarmEvent();
event.setMessage("pid:77 nid:5 exception:EXCEPTON,nid:5[setl:ERROR ## SelectTask processId = 644408,parallelism = 5,ProcessEnd processId = 644394 invalid]");
event.setNid(5L);
event.setPipelineId(2L);
event.setTitle("EXCEPTON");
// The pipeline id passed here matches the one set on the event.
monitor.feed(event, 2L);
}
Use of com.alibaba.otter.shared.communication.model.arbitrate.NodeAlarmEvent in project otter by alibaba.
Class DeadNodeListener, method processDead.
/**
 * Handles a node reported as dead: raises an alarm listing the affected channels and then
 * restarts each of those channels.
 *
 * <p>If the node shows up in the current alive-node list, nothing is done beyond a warning log.
 *
 * @param deadNode id of the node reported as dead
 */
private void processDead(Long deadNode) {
    // A momentary network interruption can make every node reconnect and set off a restart
    // storm, so liveness is re-checked right before any restart is issued.
    if (nodeMonitor.getAliveNodes(true).contains(deadNode)) {
        logger.warn("dead node[{}] happend just one moment , check it's alived", deadNode);
        return;
    }

    // Collect, in ascending id order, the channels returned for this node with status START.
    List<Long> affectedChannelIds = Lists.newArrayList();
    for (Channel started : channelService.listByNodeId(deadNode, ChannelStatus.START)) {
        affectedChannelIds.add(started.getId());
    }
    Collections.sort(affectedChannelIds);

    // Send one alarm message describing the dead node and the channels about to be restarted.
    NodeAlarmEvent alarm = new NodeAlarmEvent();
    alarm.setPipelineId(-1L);
    alarm.setTitle(MonitorName.EXCEPTION.name());
    alarm.setMessage(String.format("nid:%s is dead and restart cids:%s",
                                   String.valueOf(deadNode),
                                   affectedChannelIds.toString()));
    try {
        exceptionRuleMonitor.feed(alarm, alarm.getPipelineId());
    } catch (Exception e) {
        // An alarm failure is logged only; the restarts below still run.
        logger.error(String.format("ERROR # exceptionRuleMonitor error for %s", alarm.toString()), e);
    }

    for (Long channelId : affectedChannelIds) {
        // Restart the channel and, when that succeeds, push the configuration.
        if (arbitrateManageService.channelEvent().restart(channelId)) {
            channelService.notifyChannel(channelId);
        }
    }
}
Aggregations