Search in sources:

Example 1 with TaskHeartbeat

use of backtype.storm.generated.TaskHeartbeat in project jstorm by alibaba.

The class TaskHeartbeatUpdater, method process.

/**
 * Records the heartbeat carried by {@code input} for its source task and, when the tuple
 * originates from this updater's own task ({@code taskId}), forwards every known heartbeat
 * through {@code setTaskHeatbeat} in batches of at most {@code MAX_NUM_TASK_HB_SEND} entries.
 *
 * @param input tuple whose value at index 0 is the sender's uptime (cast to Integer)
 */
public void process(Tuple input) {
    final int srcTask = input.getSourceTask();
    final int reportedUptime = (Integer) input.getValue(0);

    // Refresh the sender's heartbeat record, creating it on first contact.
    TaskHeartbeat heartbeat = taskHbMap.get(srcTask);
    if (heartbeat != null) {
        heartbeat.set_time(TimeUtils.current_time_secs());
        heartbeat.set_uptime(reportedUptime);
    } else {
        heartbeat = new TaskHeartbeat(TimeUtils.current_time_secs(), reportedUptime);
        taskHbMap.put(srcTask, heartbeat);
    }

    // Only a tuple coming from this task itself triggers the upload.
    if (srcTask != taskId) {
        return;
    }

    // One reusable envelope; its map is filled, sent and emptied batch by batch.
    Map<Integer, TaskHeartbeat> batch = new ConcurrentHashMap<Integer, TaskHeartbeat>();
    TopologyTaskHbInfo envelope = new TopologyTaskHbInfo(topologyId, taskId);
    envelope.set_taskHbs(batch);

    int batched = 0;
    for (Entry<Integer, TaskHeartbeat> known : taskHbMap.entrySet()) {
        batch.put(known.getKey(), known.getValue());
        if (++batched < MAX_NUM_TASK_HB_SEND) {
            continue;
        }
        setTaskHeatbeat(envelope);
        batch.clear();
        batched = 0;
    }

    // Flush whatever is left after the last full batch.
    if (batch.size() > 0) {
        setTaskHeatbeat(envelope);
    }
}
Also used : TaskHeartbeat(backtype.storm.generated.TaskHeartbeat) TopologyTaskHbInfo(backtype.storm.generated.TopologyTaskHbInfo) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 2 with TaskHeartbeat

use of backtype.storm.generated.TaskHeartbeat in project jstorm by alibaba.

The class ServiceHandler, method getTopologyInfo.

/**
 * Builds the {@link TopologyInfo} of one topology: the topology summary, per-component and
 * per-task summaries (status, uptime, errors, host/port) and the metrics read from the
 * metric cache.
 *
 * @param topologyId id of the topology to describe
 * @return the assembled TopologyInfo
 * @throws TException a NotAliveException when the topology has no StormBase or no assignment;
 *         any other failure is wrapped in a TException that carries the original cause
 */
@Override
public TopologyInfo getTopologyInfo(String topologyId) throws TException {
    long start = System.nanoTime();
    StormClusterState stormClusterState = data.getStormClusterState();
    try {
        // get topology's StormBase
        StormBase base = stormClusterState.storm_base(topologyId, null);
        if (base == null) {
            throw new NotAliveException("No topology of " + topologyId);
        }
        Assignment assignment = stormClusterState.assignment_info(topologyId, null);
        if (assignment == null) {
            throw new NotAliveException("No topology of " + topologyId);
        }
        TopologyTaskHbInfo topologyTaskHbInfo = data.getTasksHeartbeat().get(topologyId);
        Map<Integer, TaskHeartbeat> taskHbMap = null;
        if (topologyTaskHbInfo != null) {
            taskHbMap = topologyTaskHbInfo.get_taskHbs();
        }
        Map<Integer, TaskInfo> taskInfoMap = Cluster.get_all_taskInfo(stormClusterState, topologyId);
        Map<Integer, String> taskToComponent = Cluster.get_all_task_component(stormClusterState, topologyId, taskInfoMap);
        Map<Integer, String> taskToType = Cluster.get_all_task_type(stormClusterState, topologyId, taskInfoMap);
        String errorString;
        if (Cluster.is_topology_exist_error(stormClusterState, topologyId)) {
            errorString = "Y";
        } else {
            errorString = "";
        }

        // topology summary
        TopologySummary topologySummary = new TopologySummary();
        topologySummary.set_id(topologyId);
        topologySummary.set_name(base.getStormName());
        topologySummary.set_uptimeSecs(TimeUtils.time_delta(base.getLanchTimeSecs()));
        topologySummary.set_status(base.getStatusString());
        topologySummary.set_numTasks(NimbusUtils.getTopologyTaskNum(assignment));
        topologySummary.set_numWorkers(assignment.getWorkers().size());
        topologySummary.set_errorInfo(errorString);

        // component summaries, one per component that owns at least one task
        Map<String, ComponentSummary> componentSummaryMap = new HashMap<>();
        HashMap<String, List<Integer>> componentToTasks = JStormUtils.reverse_map(taskToComponent);
        for (Entry<String, List<Integer>> entry : componentToTasks.entrySet()) {
            String name = entry.getKey();
            List<Integer> taskIds = entry.getValue();
            if (taskIds == null || taskIds.size() == 0) {
                LOG.warn("No task of component " + name);
                continue;
            }
            ComponentSummary componentSummary = new ComponentSummary();
            componentSummaryMap.put(name, componentSummary);
            componentSummary.set_name(name);
            componentSummary.set_type(taskToType.get(taskIds.get(0)));
            componentSummary.set_parallel(taskIds.size());
            componentSummary.set_taskIds(taskIds);
        }

        // task summaries: status/uptime from the heartbeat, errors only when the topology has any
        Map<Integer, TaskSummary> taskSummaryMap = new TreeMap<>();
        Map<Integer, List<TaskError>> taskErrors = Cluster.get_all_task_errors(stormClusterState, topologyId);
        for (Integer taskId : taskInfoMap.keySet()) {
            TaskSummary taskSummary = new TaskSummary();
            taskSummaryMap.put(taskId, taskSummary);
            taskSummary.set_taskId(taskId);
            if (taskHbMap == null) {
                taskSummary.set_status("Starting");
                taskSummary.set_uptime(0);
            } else {
                TaskHeartbeat hb = taskHbMap.get(taskId);
                if (hb == null) {
                    taskSummary.set_status("Starting");
                    taskSummary.set_uptime(0);
                } else {
                    boolean isInactive = NimbusUtils.isTaskDead(data, topologyId, taskId);
                    if (isInactive) {
                        taskSummary.set_status("INACTIVE");
                    } else {
                        taskSummary.set_status("ACTIVE");
                    }
                    taskSummary.set_uptime(hb.get_uptime());
                }
            }
            if (StringUtils.isBlank(errorString)) {
                continue;
            }
            List<TaskError> taskErrorList = taskErrors.get(taskId);
            if (taskErrorList != null && taskErrorList.size() != 0) {
                // The owning component may have no summary (unknown component for this task);
                // in that case the error is still attached to the task itself.
                ComponentSummary owner = componentSummaryMap.get(taskToComponent.get(taskId));
                for (TaskError taskError : taskErrorList) {
                    ErrorInfo errorInfo = new ErrorInfo(taskError.getError(), taskError.getTimSecs(), taskError.getLevel(), taskError.getCode());
                    taskSummary.add_to_errors(errorInfo);
                    if (owner != null) {
                        owner.add_to_errors(errorInfo);
                    }
                }
            }
        }

        // host/port of every assigned task
        for (ResourceWorkerSlot workerSlot : assignment.getWorkers()) {
            String hostname = workerSlot.getHostname();
            int port = workerSlot.getPort();
            for (Integer taskId : workerSlot.getTasks()) {
                TaskSummary taskSummary = taskSummaryMap.get(taskId);
                if (taskSummary == null) {
                    // The assignment and the task info are read separately and may disagree;
                    // skip the task instead of failing the whole request with an NPE.
                    LOG.warn("No task info of task " + taskId + " assigned in " + topologyId);
                    continue;
                }
                taskSummary.set_host(hostname);
                taskSummary.set_port(port);
            }
        }

        TopologyInfo topologyInfo = new TopologyInfo();
        topologyInfo.set_topology(topologySummary);
        topologyInfo.set_components(JStormUtils.mk_list(componentSummaryMap.values()));
        topologyInfo.set_tasks(JStormUtils.mk_list(taskSummaryMap.values()));

        // return topology metric & component metric only
        List<MetricInfo> tpMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.TOPOLOGY);
        List<MetricInfo> compMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.COMPONENT);
        List<MetricInfo> workerMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.WORKER);
        List<MetricInfo> compStreamMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.COMPONENT_STREAM);
        MetricInfo taskMetric = MetricUtils.mkMetricInfo();
        MetricInfo streamMetric = MetricUtils.mkMetricInfo();
        MetricInfo nettyMetric = MetricUtils.mkMetricInfo();
        MetricInfo tpMetric, compMetric, compStreamMetric, workerMetric;
        if (tpMetricList == null || tpMetricList.size() == 0) {
            tpMetric = MetricUtils.mkMetricInfo();
        } else {
            // get the last min topology metric
            tpMetric = tpMetricList.get(tpMetricList.size() - 1);
        }
        if (compMetricList == null || compMetricList.size() == 0) {
            compMetric = MetricUtils.mkMetricInfo();
        } else {
            compMetric = compMetricList.get(0);
        }
        if (compStreamMetricList == null || compStreamMetricList.size() == 0) {
            compStreamMetric = MetricUtils.mkMetricInfo();
        } else {
            compStreamMetric = compStreamMetricList.get(0);
        }
        if (workerMetricList == null || workerMetricList.size() == 0) {
            workerMetric = MetricUtils.mkMetricInfo();
        } else {
            workerMetric = workerMetricList.get(0);
        }
        TopologyMetric topologyMetrics = new TopologyMetric(tpMetric, compMetric, workerMetric, taskMetric, streamMetric, nettyMetric);
        topologyMetrics.set_compStreamMetric(compStreamMetric);
        topologyInfo.set_metrics(topologyMetrics);
        return topologyInfo;
    } catch (TException e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        // Keep the original exception as the cause so the failure stays diagnosable.
        throw new TException("Failed to get topologyInfo " + topologyId, e);
    } finally {
        long end = System.nanoTime();
        SimpleJStormMetric.updateNimbusHistogram("getTopologyInfo", (end - start) / TimeUtils.NS_PER_US);
    }
}
Also used : TException(org.apache.thrift.TException) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) StormBase(com.alibaba.jstorm.cluster.StormBase) ComponentSummary(backtype.storm.generated.ComponentSummary) Assignment(com.alibaba.jstorm.schedule.Assignment) TaskInfo(com.alibaba.jstorm.task.TaskInfo) NotAliveException(backtype.storm.generated.NotAliveException) ArrayList(java.util.ArrayList) List(java.util.List) TopologySummary(backtype.storm.generated.TopologySummary) ResourceWorkerSlot(com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot) TopologyTaskHbInfo(backtype.storm.generated.TopologyTaskHbInfo) ErrorInfo(backtype.storm.generated.ErrorInfo) TaskError(com.alibaba.jstorm.task.error.TaskError) TopologyMetric(backtype.storm.generated.TopologyMetric) TreeMap(java.util.TreeMap) FailedAssignTopologyException(com.alibaba.jstorm.utils.FailedAssignTopologyException) KeyNotFoundException(backtype.storm.generated.KeyNotFoundException) TException(org.apache.thrift.TException) IOException(java.io.IOException) AlreadyAliveException(backtype.storm.generated.AlreadyAliveException) TopologyAssignException(backtype.storm.generated.TopologyAssignException) FileNotFoundException(java.io.FileNotFoundException) NotAliveException(backtype.storm.generated.NotAliveException) InvalidTopologyException(backtype.storm.generated.InvalidTopologyException) KeyAlreadyExistsException(backtype.storm.generated.KeyAlreadyExistsException) TaskHeartbeat(backtype.storm.generated.TaskHeartbeat) StormClusterState(com.alibaba.jstorm.cluster.StormClusterState) MetricInfo(backtype.storm.generated.MetricInfo) TaskSummary(backtype.storm.generated.TaskSummary) TopologyInfo(backtype.storm.generated.TopologyInfo)

Example 3 with TaskHeartbeat

use of backtype.storm.generated.TaskHeartbeat in project jstorm by alibaba.

The class NimbusUtils, method isTaskDead.

/**
 * Decides whether a task counts as dead by comparing the heartbeat time it last reported
 * with the entry nimbus keeps for it in the task heartbeat cache.
 *
 * <p>The cache entry is created or refreshed as a side effect whenever a new report time is
 * observed.
 *
 * @param data nimbus state holding the reported heartbeats, the heartbeat cache and the conf
 * @param topologyId topology the task belongs to
 * @param taskId id of the task to check
 * @return true when no heartbeat was ever reported and no cache entry exists, when the task
 *         exceeded its launch timeout without reporting, or when its report time has not
 *         changed for longer than the heartbeat timeout; false otherwise
 */
public static boolean isTaskDead(NimbusData data, String topologyId, Integer taskId) {
    String idStr = " topology:" + topologyId + ",task id:" + taskId;

    // Latest time reported by the task, or null when nothing has been reported yet.
    TopologyTaskHbInfo topoTasksHbInfo = data.getTasksHeartbeat().get(topologyId);
    Map<Integer, TaskHeartbeat> reportedHbs = (topoTasksHbInfo == null) ? null : topoTasksHbInfo.get_taskHbs();
    Integer taskReportTime = null;
    if (reportedHbs != null) {
        TaskHeartbeat reported = reportedHbs.get(taskId);
        if (reported != null) {
            taskReportTime = reported.get_time();
        }
    }

    Map<Integer, TkHbCacheTime> cacheByTask = data.getTaskHeartbeatsCache(topologyId, true);
    TkHbCacheTime cached = cacheByTask.get(taskId);
    if (cached == null) {
        LOG.debug("No task heartbeat cache " + idStr);
        if (reportedHbs == null) {
            LOG.info("No task heartbeat was reported for " + idStr);
            return true;
        }
        // First sighting: seed the cache from whatever has been reported for this task.
        cached = new TkHbCacheTime();
        cached.update(reportedHbs.get(taskId));
        cacheByTask.put(taskId, cached);
        return false;
    }

    if (taskReportTime == null || taskReportTime < cached.getTaskAssignedTime()) {
        LOG.debug("No task heartbeat was reported for " + idStr);
        // Task hasn't finished init: dead only once the launch timeout has elapsed.
        int currentSecs = TimeUtils.current_time_secs();
        int assignedSecs = cached.getTaskAssignedTime();
        // default to 4 min
        int launchTimeout = JStormUtils.parseInt(data.getConf().get(Config.NIMBUS_TASK_LAUNCH_SECS));
        boolean initTimedOut = currentSecs - assignedSecs > launchTimeout;
        if (initTimedOut) {
            LOG.info(idStr + " failed to init ");
        }
        return initTimedOut;
    }

    // From here on the task has reported at least once since it was assigned.
    int nimbusTime = cached.getNimbusTime();
    int cachedReportTime = cached.getTaskReportedTime();
    int nowSecs = TimeUtils.current_time_secs();

    if (nimbusTime == 0) {
        // Cache entry never stamped before: record the first observation.
        cached.setNimbusTime(nowSecs);
        cached.setTaskReportedTime(taskReportTime);
        LOG.info("Update task heartbeat to nimbus cache " + idStr);
        return false;
    }

    if (cachedReportTime != taskReportTime) {
        // A newer report arrived since the last check: refresh the cache.
        cached.setNimbusTime(nowSecs);
        cached.setTaskReportedTime(taskReportTime);
        LOG.debug(idStr + ",nimbusTime " + nowSecs + ",zkReport:" + taskReportTime + ",report:" + cachedReportTime);
        return false;
    }

    // Report time unchanged: check how long nimbus has been seeing the same value.
    Integer taskHBTimeout = data.getTopologyTaskTimeout().get(topologyId);
    if (taskHBTimeout == null) {
        // default to 2 min
        taskHBTimeout = JStormUtils.parseInt(data.getConf().get(Config.NIMBUS_TASK_TIMEOUT_SECS));
    }
    if (taskId == topoTasksHbInfo.get_topologyMasterId()) {
        // The topology master gets half the timeout.
        taskHBTimeout = (taskHBTimeout / 2);
    }

    if (nowSecs - nimbusTime <= taskHBTimeout) {
        return false;
    }

    // task is dead
    Date lastTaskHBDate = new Date(((long) nimbusTime) * 1000);
    LOG.debug(idStr + " last task time is " + nimbusTime + ":" + lastTaskHBDate + ",current " + nowSecs + ":" + new Date(((long) nowSecs) * 1000));
    return true;
}
Also used : TaskHeartbeat(backtype.storm.generated.TaskHeartbeat) TopologyTaskHbInfo(backtype.storm.generated.TopologyTaskHbInfo) TkHbCacheTime(com.alibaba.jstorm.task.TkHbCacheTime) Date(java.util.Date)

Example 4 with TaskHeartbeat

use of backtype.storm.generated.TaskHeartbeat in project jstorm by alibaba.

The class MonitorRunnable, method run.

/**
 * One monitor pass over every assigned topology that still has a StormBase: collects the
 * dead tasks, reports one task error per affected worker, pushes a TaskDeadEvent, triggers
 * the {@code monitor} status transition when any task is dead, and finally writes the cached
 * task heartbeats through {@code clusterState.topology_heartbeat}.
 *
 * <p>A topology whose master has not been up for the launch timeout yet is skipped for this
 * pass; the remaining topologies are still checked.
 *
 * <p>TODO: when one topology is being reassigned, the topology should skip the check.
 */
@Override
public void run() {
    StormClusterState clusterState = data.getStormClusterState();
    try {
        // Note: need first check Assignments
        List<String> activeTopologies = clusterState.assignments(null);
        if (activeTopologies == null) {
            LOG.info("Failed to get active topologies");
            return;
        }
        for (String topologyId : activeTopologies) {
            if (clusterState.storm_base(topologyId, null) == null) {
                continue;
            }
            LOG.debug("Check tasks of topology " + topologyId);
            // Note that we don't check /ZK-dir/taskbeats/topologyId to get task ids
            Set<Integer> taskIds = clusterState.task_ids(topologyId);
            if (taskIds == null) {
                LOG.info("Failed to get task ids of " + topologyId);
                continue;
            }
            Assignment assignment = clusterState.assignment_info(topologyId, null);
            Set<Integer> deadTasks = new HashSet<>();
            boolean needReassign = false;
            for (Integer task : taskIds) {
                boolean isTaskDead = NimbusUtils.isTaskDead(data, topologyId, task);
                if (isTaskDead) {
                    deadTasks.add(task);
                    needReassign = true;
                }
            }
            TopologyTaskHbInfo topologyHbInfo = data.getTasksHeartbeat().get(topologyId);
            if (needReassign) {
                if (topologyHbInfo != null) {
                    int topologyMasterId = topologyHbInfo.get_topologyMasterId();
                    if (deadTasks.contains(topologyMasterId)) {
                        // Dead topology master: replace the dead set with the tasks of the
                        // master's worker (or the master alone when it has no worker).
                        deadTasks.clear();
                        if (assignment != null) {
                            ResourceWorkerSlot resource = assignment.getWorkerByTaskId(topologyMasterId);
                            if (resource != null) {
                                deadTasks.addAll(resource.getTasks());
                            } else {
                                deadTasks.add(topologyMasterId);
                            }
                        }
                    } else {
                        Map<Integer, TaskHeartbeat> taskHbs = topologyHbInfo.get_taskHbs();
                        int launchTime = JStormUtils.parseInt(data.getConf().get(Config.NIMBUS_TASK_LAUNCH_SECS));
                        if (taskHbs == null || taskHbs.get(topologyMasterId) == null || taskHbs.get(topologyMasterId).get_uptime() < launchTime) {
                            // The master has not been up long enough: skip only THIS topology.
                            // Returning here would abort the pass and leave every topology
                            // after this one unchecked.
                            continue;
                        }
                    }
                    Map<Integer, ResourceWorkerSlot> deadTaskWorkers = new HashMap<>();
                    for (Integer task : deadTasks) {
                        LOG.info("Found " + topologyId + ", taskId:" + task + " is dead");
                        ResourceWorkerSlot resource = null;
                        if (assignment != null) {
                            resource = assignment.getWorkerByTaskId(task);
                        }
                        if (resource != null) {
                            deadTaskWorkers.put(task, resource);
                        }
                    }
                    Map<ResourceWorkerSlot, List<Integer>> workersDeadTasks = JStormUtils.reverse_map(deadTaskWorkers);
                    for (Map.Entry<ResourceWorkerSlot, List<Integer>> entry : workersDeadTasks.entrySet()) {
                        ResourceWorkerSlot resource = entry.getKey();
                        List<Integer> workerDeadTasks = entry.getValue();
                        if (workerDeadTasks.isEmpty()) {
                            continue;
                        }
                        // One error per worker, filed under its first dead task; the message
                        // itself lists all dead tasks of that worker.
                        Integer task = workerDeadTasks.get(0);
                        String nowStr = TimeFormat.getSecond(new Date());
                        String errorInfo = "Task-" + workerDeadTasks.toString() + " is dead on " + resource.getHostname() + ":" + resource.getPort() + ", " + nowStr;
                        LOG.info(errorInfo);
                        clusterState.report_task_error(topologyId, task, errorInfo, ErrorConstants.ERROR, ErrorConstants.CODE_TASK_DEAD, ErrorConstants.DURATION_SECS_TASK_DEAD);
                    }
                    if (deadTaskWorkers.size() > 0) {
                        // notify jstorm monitor
                        TaskDeadEvent.pushEvent(topologyId, deadTaskWorkers);
                    }
                }
                NimbusUtils.transition(data, topologyId, false, StatusType.monitor);
            }
            if (topologyHbInfo != null) {
                try {
                    clusterState.topology_heartbeat(topologyId, topologyHbInfo);
                } catch (Exception e) {
                    // Best effort: a failed heartbeat write must not stop the monitor pass.
                    LOG.error("Failed to update task heartbeat info to ZK for " + topologyId, e);
                }
            }
        }
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
    }
}
Also used : HashMap(java.util.HashMap) TopologyTaskHbInfo(backtype.storm.generated.TopologyTaskHbInfo) Date(java.util.Date) TaskHeartbeat(backtype.storm.generated.TaskHeartbeat) StormClusterState(com.alibaba.jstorm.cluster.StormClusterState) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet) ResourceWorkerSlot(com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot)

Example 5 with TaskHeartbeat

use of backtype.storm.generated.TaskHeartbeat in project jstorm by alibaba.

The class ServiceHandler, method updateTaskHeartbeat.

/**
 * Merges the task heartbeats carried by {@code taskHbs} into the heartbeat info nimbus keeps
 * for the same topology, creating that info and its heartbeat map when they do not exist yet.
 *
 * @param taskHbs heartbeat info reported for one topology
 */
@Override
public void updateTaskHeartbeat(TopologyTaskHbInfo taskHbs) throws TException {
    final String topologyId = taskHbs.get_topologyId();
    final Integer topologyMasterId = taskHbs.get_topologyMasterId();

    // Find or register the per-topology record held by nimbus.
    TopologyTaskHbInfo cachedInfo = data.getTasksHeartbeat().get(topologyId);
    if (cachedInfo == null) {
        cachedInfo = new TopologyTaskHbInfo(topologyId, topologyMasterId);
        data.getTasksHeartbeat().put(topologyId, cachedInfo);
    }

    // Make sure the record has a heartbeat map to merge into.
    Map<Integer, TaskHeartbeat> cachedHbs = cachedInfo.get_taskHbs();
    if (cachedHbs == null) {
        cachedHbs = new ConcurrentHashMap<>();
        cachedInfo.set_taskHbs(cachedHbs);
    }

    Map<Integer, TaskHeartbeat> incomingHbs = taskHbs.get_taskHbs();
    if (incomingHbs == null) {
        return;
    }
    // Newly reported heartbeats overwrite the stored ones task by task.
    cachedHbs.putAll(incomingHbs);
}
Also used : TaskHeartbeat(backtype.storm.generated.TaskHeartbeat) TopologyTaskHbInfo(backtype.storm.generated.TopologyTaskHbInfo)

Aggregations

TaskHeartbeat (backtype.storm.generated.TaskHeartbeat)5 TopologyTaskHbInfo (backtype.storm.generated.TopologyTaskHbInfo)5 StormClusterState (com.alibaba.jstorm.cluster.StormClusterState)2 ResourceWorkerSlot (com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot)2 Date (java.util.Date)2 HashMap (java.util.HashMap)2 List (java.util.List)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 AlreadyAliveException (backtype.storm.generated.AlreadyAliveException)1 ComponentSummary (backtype.storm.generated.ComponentSummary)1 ErrorInfo (backtype.storm.generated.ErrorInfo)1 InvalidTopologyException (backtype.storm.generated.InvalidTopologyException)1 KeyAlreadyExistsException (backtype.storm.generated.KeyAlreadyExistsException)1 KeyNotFoundException (backtype.storm.generated.KeyNotFoundException)1 MetricInfo (backtype.storm.generated.MetricInfo)1 NotAliveException (backtype.storm.generated.NotAliveException)1 TaskSummary (backtype.storm.generated.TaskSummary)1 TopologyAssignException (backtype.storm.generated.TopologyAssignException)1 TopologyInfo (backtype.storm.generated.TopologyInfo)1 TopologyMetric (backtype.storm.generated.TopologyMetric)1