use of com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot in project jstorm by alibaba.
the class ServiceHandler method getSupervisorWorkersByHostOrId.
/**
 * Gets the workers running on a supervisor, looked up by supervisor id or by host.
 * When both are given, the id takes precedence over the host.
 *
 * @param host supervisor host name or IP; only consulted when {@code id} is blank
 * @param id   supervisor id
 * @return the supervisor summary, its worker summaries ordered by port, and worker metrics
 *         keyed by worker slot name
 * @throws TException if neither host nor id is given, if no matching supervisor is found,
 *                    or if collecting the data fails
 */
private SupervisorWorkers getSupervisorWorkersByHostOrId(String host, String id) throws TException {
    long start = System.nanoTime();
    if (StringUtils.isBlank(id) && StringUtils.isBlank(host)) {
        throw new TException("Must specify host or supervisor id!");
    }
    try {
        StormClusterState stormClusterState = data.getStormClusterState();
        // all supervisors
        Map<String, SupervisorInfo> supervisorInfos = Cluster.get_all_SupervisorInfo(stormClusterState, null);
        SupervisorInfo supervisorInfo = null;
        String ip;
        if (!StringUtils.isBlank(id)) {
            supervisorInfo = supervisorInfos.get(id);
            // check before dereferencing, so an unknown id is reported as such instead of as an NPE
            if (supervisorInfo == null) {
                throw new TException("unknown supervisor id:" + id);
            }
            host = supervisorInfo.getHostName();
            ip = NetWorkUtils.host2Ip(host);
        } else {
            // id is blank, so host is guaranteed non-blank by the argument check above
            ip = NetWorkUtils.host2Ip(host);
            for (Entry<String, SupervisorInfo> entry : supervisorInfos.entrySet()) {
                SupervisorInfo info = entry.getValue();
                String hostName = info.getHostName();
                // a supervisor may be registered under either its host name or its IP
                if (host.equals(hostName) || (ip != null && ip.equals(hostName))) {
                    id = entry.getKey();
                    supervisorInfo = info;
                    break;
                }
            }
            if (supervisorInfo == null) {
                throw new TException("unknown supervisor host:" + host);
            }
        }

        Map<String, Assignment> assignments = Cluster.get_all_assignment(stormClusterState, null);
        // TreeMap keeps the resulting worker list ordered by port
        Map<Integer, WorkerSummary> portWorkerSummaries = new TreeMap<>();
        int usedSlotNumber = 0;
        // per-topology cache of task id -> component name, filled lazily
        Map<String, Map<Integer, String>> topologyTaskToComponent = new HashMap<>();
        Map<String, MetricInfo> metricInfoMap = new HashMap<>();
        for (Entry<String, Assignment> entry : assignments.entrySet()) {
            String topologyId = entry.getKey();
            Assignment assignment = entry.getValue();
            Set<ResourceWorkerSlot> workers = assignment.getWorkers();
            for (ResourceWorkerSlot worker : workers) {
                // only workers placed on the requested supervisor are of interest
                if (!id.equals(worker.getNodeId())) {
                    continue;
                }
                usedSlotNumber++;
                Integer port = worker.getPort();
                WorkerSummary workerSummary = portWorkerSummaries.get(port);
                if (workerSummary == null) {
                    workerSummary = new WorkerSummary();
                    workerSummary.set_port(port);
                    workerSummary.set_topology(topologyId);
                    workerSummary.set_tasks(new ArrayList<TaskComponent>());
                    portWorkerSummaries.put(port, workerSummary);
                }
                Map<Integer, String> taskToComponent = topologyTaskToComponent.get(topologyId);
                if (taskToComponent == null) {
                    taskToComponent = Cluster.get_all_task_component(stormClusterState, topologyId, null);
                    topologyTaskToComponent.put(topologyId, taskToComponent);
                }
                // worker uptime is derived from the earliest start time among its tasks
                int earliest = TimeUtils.current_time_secs();
                for (Integer taskId : worker.getTasks()) {
                    TaskComponent taskComponent = new TaskComponent();
                    taskComponent.set_component(taskToComponent.get(taskId));
                    taskComponent.set_taskId(taskId);
                    Integer startTime = assignment.getTaskStartTimeSecs().get(taskId);
                    if (startTime != null && startTime < earliest) {
                        earliest = startTime;
                    }
                    workerSummary.add_to_tasks(taskComponent);
                }
                workerSummary.set_uptime(TimeUtils.time_delta(earliest));

                String workerSlotName = getWorkerSlotName(ip, port);
                List<MetricInfo> workerMetricInfoList = this.data.getMetricCache().getMetricData(topologyId, MetaType.WORKER);
                // the metric cache may have nothing for this topology
                if (workerMetricInfoList != null && !workerMetricInfoList.isEmpty()) {
                    MetricInfo workerMetricInfo = workerMetricInfoList.get(0);
                    // remove metrics that don't belong to current worker
                    // NOTE(review): this removes entries from the MetricInfo returned by the metric cache
                    // in place, and filters by ip rather than by worker slot name — confirm both are intended.
                    for (Iterator<String> itr = workerMetricInfo.get_metrics().keySet().iterator(); itr.hasNext(); ) {
                        String metricName = itr.next();
                        if (!metricName.contains(ip)) {
                            itr.remove();
                        }
                    }
                    metricInfoMap.put(workerSlotName, workerMetricInfo);
                }
            }
        }

        List<WorkerSummary> workerList = new ArrayList<>(portWorkerSummaries.values());
        Map<String, Integer> supervisorToUsedSlotNum = new HashMap<>();
        supervisorToUsedSlotNum.put(id, usedSlotNumber);
        SupervisorSummary supervisorSummary = NimbusUtils.mkSupervisorSummary(supervisorInfo, id, supervisorToUsedSlotNum);
        return new SupervisorWorkers(supervisorSummary, workerList, metricInfoMap);
    } catch (TException e) {
        LOG.info("Failed to get supervisor workers ", e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get supervisor workers ", e);
        throw new TException(e);
    } finally {
        // record the call latency whether the call succeeded or failed
        long end = System.nanoTime();
        SimpleJStormMetric.updateNimbusHistogram("getSupervisorWorkers", (end - start) / TimeUtils.NS_PER_US);
    }
}
use of com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot in project jstorm by alibaba.
the class ServiceHandler method getTopologyInfo.
/**
 * Builds the {@link TopologyInfo} of a topology: its summary, per-component summaries,
 * per-task summaries (status, uptime, host/port, errors) and the cached topology,
 * component, component-stream and worker metrics.
 *
 * @param topologyId id of the topology to describe
 * @return the assembled topology info
 * @throws NotAliveException if the topology has no StormBase or no assignment
 * @throws TException if collecting the data fails for any other reason
 */
@Override
public TopologyInfo getTopologyInfo(String topologyId) throws TException {
    long start = System.nanoTime();
    StormClusterState stormClusterState = data.getStormClusterState();
    try {
        // get topology's StormBase
        StormBase base = stormClusterState.storm_base(topologyId, null);
        if (base == null) {
            throw new NotAliveException("No topology of " + topologyId);
        }
        Assignment assignment = stormClusterState.assignment_info(topologyId, null);
        if (assignment == null) {
            throw new NotAliveException("No topology of " + topologyId);
        }

        TopologyTaskHbInfo topologyTaskHbInfo = data.getTasksHeartbeat().get(topologyId);
        Map<Integer, TaskHeartbeat> taskHbMap = null;
        if (topologyTaskHbInfo != null) {
            taskHbMap = topologyTaskHbInfo.get_taskHbs();
        }
        Map<Integer, TaskInfo> taskInfoMap = Cluster.get_all_taskInfo(stormClusterState, topologyId);
        Map<Integer, String> taskToComponent = Cluster.get_all_task_component(stormClusterState, topologyId, taskInfoMap);
        Map<Integer, String> taskToType = Cluster.get_all_task_type(stormClusterState, topologyId, taskInfoMap);
        // errorInfo is "Y" when the topology has recorded errors, empty otherwise
        boolean hasError = Cluster.is_topology_exist_error(stormClusterState, topologyId);
        String errorString = hasError ? "Y" : "";

        TopologySummary topologySummary = new TopologySummary();
        topologySummary.set_id(topologyId);
        topologySummary.set_name(base.getStormName());
        topologySummary.set_uptimeSecs(TimeUtils.time_delta(base.getLanchTimeSecs()));
        topologySummary.set_status(base.getStatusString());
        topologySummary.set_numTasks(NimbusUtils.getTopologyTaskNum(assignment));
        topologySummary.set_numWorkers(assignment.getWorkers().size());
        topologySummary.set_errorInfo(errorString);

        // component summaries, built from the inverted task -> component map
        Map<String, ComponentSummary> componentSummaryMap = new HashMap<>();
        HashMap<String, List<Integer>> componentToTasks = JStormUtils.reverse_map(taskToComponent);
        for (Entry<String, List<Integer>> entry : componentToTasks.entrySet()) {
            String name = entry.getKey();
            List<Integer> taskIds = entry.getValue();
            if (taskIds == null || taskIds.size() == 0) {
                LOG.warn("No task of component " + name);
                continue;
            }
            ComponentSummary componentSummary = new ComponentSummary();
            componentSummaryMap.put(name, componentSummary);
            componentSummary.set_name(name);
            // the component type is taken from its first task
            componentSummary.set_type(taskToType.get(taskIds.get(0)));
            componentSummary.set_parallel(taskIds.size());
            componentSummary.set_taskIds(taskIds);
        }

        // task summaries, ordered by task id
        Map<Integer, TaskSummary> taskSummaryMap = new TreeMap<>();
        Map<Integer, List<TaskError>> taskErrors = Cluster.get_all_task_errors(stormClusterState, topologyId);
        for (Integer taskId : taskInfoMap.keySet()) {
            TaskSummary taskSummary = new TaskSummary();
            taskSummaryMap.put(taskId, taskSummary);
            taskSummary.set_taskId(taskId);

            // a task without any heartbeat yet is reported as still starting
            TaskHeartbeat hb = (taskHbMap == null) ? null : taskHbMap.get(taskId);
            if (hb == null) {
                taskSummary.set_status("Starting");
                taskSummary.set_uptime(0);
            } else {
                boolean isInactive = NimbusUtils.isTaskDead(data, topologyId, taskId);
                taskSummary.set_status(isInactive ? "INACTIVE" : "ACTIVE");
                taskSummary.set_uptime(hb.get_uptime());
            }

            // task errors are only attached when the topology is flagged as having errors
            if (!hasError) {
                continue;
            }
            List<TaskError> taskErrorList = taskErrors.get(taskId);
            if (taskErrorList == null || taskErrorList.isEmpty()) {
                continue;
            }
            // may be null when the task's component got no summary above; the errors are then
            // reported on the task only instead of failing the whole request with an NPE
            ComponentSummary componentSummary = componentSummaryMap.get(taskToComponent.get(taskId));
            for (TaskError taskError : taskErrorList) {
                ErrorInfo errorInfo = new ErrorInfo(taskError.getError(), taskError.getTimSecs(), taskError.getLevel(), taskError.getCode());
                taskSummary.add_to_errors(errorInfo);
                if (componentSummary != null) {
                    componentSummary.add_to_errors(errorInfo);
                }
            }
        }

        // fill in where each task runs
        for (ResourceWorkerSlot workerSlot : assignment.getWorkers()) {
            String hostname = workerSlot.getHostname();
            int port = workerSlot.getPort();
            for (Integer taskId : workerSlot.getTasks()) {
                TaskSummary taskSummary = taskSummaryMap.get(taskId);
                // the assignment may list a task that has no task info entry; skip it rather than NPE
                if (taskSummary == null) {
                    LOG.warn("No task info of task " + taskId + " in topology " + topologyId);
                    continue;
                }
                taskSummary.set_host(hostname);
                taskSummary.set_port(port);
            }
        }

        TopologyInfo topologyInfo = new TopologyInfo();
        topologyInfo.set_topology(topologySummary);
        topologyInfo.set_components(JStormUtils.mk_list(componentSummaryMap.values()));
        topologyInfo.set_tasks(JStormUtils.mk_list(taskSummaryMap.values()));

        // return topology metric & component metric only
        List<MetricInfo> tpMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.TOPOLOGY);
        List<MetricInfo> compMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.COMPONENT);
        List<MetricInfo> workerMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.WORKER);
        List<MetricInfo> compStreamMetricList = data.getMetricCache().getMetricData(topologyId, MetaType.COMPONENT_STREAM);

        // task, stream and netty metrics are always returned empty by this call
        MetricInfo taskMetric = MetricUtils.mkMetricInfo();
        MetricInfo streamMetric = MetricUtils.mkMetricInfo();
        MetricInfo nettyMetric = MetricUtils.mkMetricInfo();

        // get the last min topology metric (last list element); the other kinds use the first element
        MetricInfo tpMetric = (tpMetricList == null || tpMetricList.isEmpty())
                ? MetricUtils.mkMetricInfo()
                : tpMetricList.get(tpMetricList.size() - 1);
        MetricInfo compMetric = (compMetricList == null || compMetricList.isEmpty())
                ? MetricUtils.mkMetricInfo()
                : compMetricList.get(0);
        MetricInfo compStreamMetric = (compStreamMetricList == null || compStreamMetricList.isEmpty())
                ? MetricUtils.mkMetricInfo()
                : compStreamMetricList.get(0);
        MetricInfo workerMetric = (workerMetricList == null || workerMetricList.isEmpty())
                ? MetricUtils.mkMetricInfo()
                : workerMetricList.get(0);

        TopologyMetric topologyMetrics = new TopologyMetric(tpMetric, compMetric, workerMetric, taskMetric, streamMetric, nettyMetric);
        topologyMetrics.set_compStreamMetric(compStreamMetric);
        topologyInfo.set_metrics(topologyMetrics);
        return topologyInfo;
    } catch (TException e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        throw e;
    } catch (Exception e) {
        LOG.info("Failed to get topologyInfo " + topologyId, e);
        // keep the original failure as the cause so callers can diagnose it
        throw new TException("Failed to get topologyInfo " + topologyId, e);
    } finally {
        // record the call latency whether the call succeeded or failed
        long end = System.nanoTime();
        SimpleJStormMetric.updateNimbusHistogram("getTopologyInfo", (end - start) / TimeUtils.NS_PER_US);
    }
}
use of com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot in project jstorm by alibaba.
the class RefreshConnections method run.
/**
 * Refreshes this worker's view of the topology assignment and its outbound connections.
 *
 * <p>While holding this object's monitor, the method:
 * <ol>
 * <li>reads the assignment version from {@code zkCluster} and the local supervisor topology
 * timestamp, and decides whether either one changed;</li>
 * <li>if something changed, re-reads the assignment; when the local timestamp is newer it
 * either pushes a reloaded topology conf to the tasks (assignment type
 * {@code UpdateTopology}) or shuts down / creates / updates tasks and recomputes the
 * outbound task set;</li>
 * <li>rebuilds the task-to-worker-slot mapping, opens connections to worker slots that host
 * outbound tasks and closes connections that are no longer needed;</li>
 * <li>on every run, records the connection status of each outbound task and remembers the
 * assignment version that was read.</li>
 * </ol>
 *
 * @throws RuntimeException wrapping any exception that escapes the steps above
 */
@Override
public void run() {
try {
synchronized (this) {
// NOTE(review): `this` is passed as the second argument, presumably as a watch callback — confirm.
Integer recordedVersion = zkCluster.assignment_version(topologyId, this);
// true when no version could be read or it differs from the last version recorded by this method
boolean isUpdateAssignment = !(recordedVersion != null && recordedVersion.equals(assignmentVersion));
boolean isUpdateSupervisorTimeStamp = false;
Long localAssignmentTS = null;
try {
localAssignmentTS = StormConfig.read_supervisor_topology_timestamp(conf, topologyId);
// the locally stored timestamp is newer than the one this worker last applied
isUpdateSupervisorTimeStamp = localAssignmentTS > workerData.getAssignmentTs();
} catch (FileNotFoundException e) {
// missing timestamp file: log and carry on with isUpdateSupervisorTimeStamp == false
LOG.warn("Failed to read supervisor topology timestamp for " + topologyId + " port=" + workerData.getPort(), e);
}
if (isUpdateAssignment || isUpdateSupervisorTimeStamp) {
LOG.info("update worker data due to changed assignment!!!");
Assignment assignment = zkCluster.assignment_info(topologyId, this);
if (assignment == null) {
String errMsg = "Failed to get assignment of " + topologyId;
LOG.error(errMsg);
// throw new RuntimeException(errMsg);
// give up for this run; assignmentVersion is left unchanged
return;
}
// The local timestamp advanced: apply the change to the tasks, and (for a non-config change)
// update the outbound task map accordingly.
if (isUpdateSupervisorTimeStamp) {
try {
if (assignment.getAssignmentType() == AssignmentType.UpdateTopology) {
LOG.info("Get config reload request for " + topologyId);
// If config was updated, notify all tasks
List<TaskShutdownDameon> taskShutdowns = workerData.getShutdownTasks();
Map newConf = StormConfig.read_supervisor_topology_conf(conf, topologyId);
// merge the re-read conf into the worker's conf before notifying the tasks
workerData.getStormConf().putAll(newConf);
for (TaskShutdownDameon taskSD : taskShutdowns) {
taskSD.update(newConf);
}
// disable/enable metrics on the fly
workerData.getUpdateListener().update(newConf);
workerData.setAssignmentType(AssignmentType.UpdateTopology);
} else {
// the three task sets are computed before workerData is updated with the new assignment
Set<Integer> addedTasks = getAddedTasks(assignment);
Set<Integer> removedTasks = getRemovedTasks(assignment);
Set<Integer> updatedTasks = getUpdatedTasks(assignment);
workerData.updateWorkerData(assignment);
workerData.updateKryoSerializer();
shutdownTasks(removedTasks);
createTasks(addedTasks);
updateTasks(updatedTasks);
// recompute the outbound tasks and sync the per-task status entries with the new set
Set<Integer> tmpOutboundTasks = Worker.worker_output_tasks(workerData);
if (!outboundTasks.equals(tmpOutboundTasks)) {
// add status entries for tasks that are newly outbound
for (int taskId : tmpOutboundTasks) {
if (!outboundTasks.contains(taskId))
workerData.addOutboundTaskStatusIfAbsent(taskId);
}
// drop status entries for tasks that are no longer outbound
for (int taskId : workerData.getOutboundTaskStatus().keySet()) {
if (!tmpOutboundTasks.contains(taskId)) {
workerData.removeOutboundTaskStatus(taskId);
}
}
workerData.setOutboundTasks(tmpOutboundTasks);
outboundTasks = tmpOutboundTasks;
}
workerData.setAssignmentType(AssignmentType.Assign);
}
// the tasks will update the related data.
// the applied timestamp is only recorded when the steps above did not throw
if (localAssignmentTS != null)
workerData.setAssignmentTs(localAssignmentTS);
} catch (Exception e) {
// failure is logged and swallowed; the connection refresh below still runs
LOG.warn("Failed to update worker data", e);
}
}
Set<ResourceWorkerSlot> workers = assignment.getWorkers();
if (workers == null) {
String errMsg = "Failed to get worker slots of " + topologyId;
LOG.error(errMsg);
return;
}
workerData.updateWorkerToResource(workers);
// task id -> worker slot hosting it, for every task in the assignment
Map<Integer, WorkerSlot> taskNodePortTmp = new HashMap<>();
// node id -> host, used below to resolve the host of each new connection
Map<String, String> node = assignment.getNodeHost();
// only reserve outboundTasks
Set<ResourceWorkerSlot> needConnections = new HashSet<>();
// NOTE(review): localTasks is filled below but only consumed by the commented-out setLocalTasks call.
Set<Integer> localTasks = new HashSet<>();
Set<Integer> localNodeTasks = new HashSet<>();
if (outboundTasks != null) {
for (ResourceWorkerSlot worker : workers) {
// tasks of any worker on this supervisor
if (supervisorId.equals(worker.getNodeId()))
localNodeTasks.addAll(worker.getTasks());
// tasks of this very worker (same supervisor and same port)
if (supervisorId.equals(worker.getNodeId()) && worker.getPort() == workerData.getPort())
localTasks.addAll(worker.getTasks());
for (Integer id : worker.getTasks()) {
taskNodePortTmp.put(id, worker);
// a worker hosting at least one outbound task needs a connection
if (outboundTasks.contains(id)) {
needConnections.add(worker);
}
}
}
}
// entries are added or overwritten; mappings for tasks absent from this assignment are not removed here
taskToNodePort.putAll(taskNodePortTmp);
// workerData.setLocalTasks(localTasks);
workerData.setLocalNodeTasks(localNodeTasks);
// get which connection need to be remove or add
// NOTE(review): presumably a live key-set view of nodePortToSocket — both diffs below are
// computed before the map is modified.
Set<WorkerSlot> currentConnections = nodePortToSocket.keySet();
Set<ResourceWorkerSlot> newConnections = new HashSet<>();
Set<WorkerSlot> removeConnections = new HashSet<>();
// NOTE(review): these contains() checks compare ResourceWorkerSlot against WorkerSlot keys and
// so depend on how those classes define equals/hashCode — confirm.
for (ResourceWorkerSlot nodePort : needConnections) {
if (!currentConnections.contains(nodePort)) {
newConnections.add(nodePort);
}
}
for (WorkerSlot node_port : currentConnections) {
if (!needConnections.contains(node_port)) {
removeConnections.add(node_port);
}
}
// create new connection
for (ResourceWorkerSlot nodePort : newConnections) {
String host = node.get(nodePort.getNodeId());
int port = nodePort.getPort();
IConnection conn = context.connect(topologyId, host, port, workerData.getTaskIds(), nodePort.getTasks());
nodePortToSocket.put(nodePort, conn);
LOG.info("Add connection to " + nodePort);
}
// close useless connection
for (WorkerSlot node_port : removeConnections) {
LOG.info("Remove connection to " + node_port);
nodePortToSocket.remove(node_port).close();
}
}
// check the status of connections to all outbound tasks
boolean allConnectionReady = true;
for (Integer taskId : outboundTasks) {
boolean isConnected = isOutTaskConnected(taskId);
// isConnected is false here, so this clears the flag
if (!isConnected)
allConnectionReady = isConnected;
workerData.updateOutboundTaskStatus(taskId, isConnected);
}
// the init-connection status is only ever set to true here; this method never resets it to false
if (allConnectionReady) {
workerData.getWorkerInitConnectionStatus().getAndSet(allConnectionReady);
}
// remember the version just read, so the next run can detect a change
if (recordedVersion != null)
assignmentVersion = recordedVersion;
}
} catch (Exception e) {
LOG.error("Failed to refresh worker connections", e);
throw new RuntimeException(e);
}
}
use of com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot in project jstorm by alibaba.
the class TaskDeadEvent method run.
/**
 * Sends this event through the metric uploader delegate, then unregisters every distinct
 * worker referenced by {@code deadTasks} from the metric cache.
 */
@Override
public void run() {
    context.getMetricUploaderDelegate().sendEvent(context.getClusterName(), this);

    // unregister dead workers; the set collapses a worker referenced by several entries
    // so that it is unregistered only once
    Set<ResourceWorkerSlot> distinctWorkers = new HashSet<>(deadTasks.values());
    for (ResourceWorkerSlot slot : distinctWorkers) {
        context.getMetricCache().unregisterWorker(topologyId, slot.getHostname(), slot.getPort());
    }
}
use of com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot in project jstorm by alibaba.
the class MetricRegister method broadcast.
/**
 * Registers the metric names collected so far and sends the resulting name-to-id map to
 * one task of every worker known to the topology master context.
 *
 * <p>Only one broadcast runs at a time: if another one is still pending, this call logs a
 * warning and returns without doing anything. Failures are logged and not propagated.
 */
public void broadcast() {
    // the pending flag acts as a non-blocking guard against overlapping broadcasts
    if (!pending.compareAndSet(false, true)) {
        LOG.warn("pending register metrics, skip...");
        return;
    }
    try {
        // swap in a fresh set so names added from now on go to the next broadcast
        // NOTE(review): if registration below fails, the swapped-out names are not re-queued — confirm intended.
        Set<String> oldMetricsNames = metricNames.getAndSet(new HashSet<String>());
        if (oldMetricsNames.isEmpty()) {
            return;
        }
        LOG.debug("register metrics to nimbus from TM, size:{}", oldMetricsNames.size());
        Map<String, Long> nameIdMap = metricsRegister.registerMetrics(oldMetricsNames);
        LOG.debug("register metrics to nimbus from TM, ret size:{}", nameIdMap.size());
        if (nameIdMap.isEmpty()) {
            return;
        }
        // the whole map goes to every worker; for big topologies it might be quite large
        for (ResourceWorkerSlot worker : tmContext.getWorkerSet().get()) {
            Set<Integer> tasks = worker.getTasks();
            // a worker without tasks has nobody to receive the response; skip it instead of
            // letting iterator().next() throw and abort the broadcast to the remaining workers
            if (tasks == null || tasks.isEmpty()) {
                LOG.warn("worker {} has no tasks, skip register metrics response", worker);
                continue;
            }
            // one task per worker is enough to deliver the response to that worker
            int task = tasks.iterator().next();
            tmContext.getCollector().getDelegate().emitDirect(task, Common.TOPOLOGY_MASTER_REGISTER_METRICS_RESP_STREAM_ID, null, new Values(nameIdMap));
        }
    } catch (Throwable e) {
        LOG.error("Error:", e);
    } finally {
        // always release the guard, including on the early returns above
        pending.set(false);
    }
}
Aggregations