Search in sources :

Example 1 with ProcessNodeEventData

use of com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData in project otter by alibaba.

the class SelectZooKeeperArbitrateEvent method markUsed.

/**
 * Marks the current process node as used.
 *
 * <p>Writes a {@link ProcessNodeEventData} payload (current node id, status {@code USED},
 * mode {@code ZOOKEEPER}) as JSON bytes to the process path derived from the event's
 * pipeline id and process id.
 *
 * @param data event supplying the pipeline id and process id of the process node
 * @throws ZkNoNodeException declared by the signature; the exact condition is decided by the
 *         zookeeper client, which is not visible here
 * @throws ZkException declared by the signature; the exact condition is decided by the
 *         zookeeper client, which is not visible here
 */
private void markUsed(EtlEventData data) throws ZkNoNodeException, ZkException {
    String processPath = StagePathUtils.getProcess(data.getPipelineId(), data.getProcessId());

    // Build the payload to serialize: owning node id, USED status, and an explicit
    // declaration of zookeeper mode.
    ProcessNodeEventData usedMarker = new ProcessNodeEventData();
    usedMarker.setNid(ArbitrateConfigUtils.getCurrentNid());
    usedMarker.setStatus(ProcessNodeEventData.Status.USED);
    usedMarker.setMode(ArbitrateMode.ZOOKEEPER);

    // Serialize and overwrite the data stored on the process node.
    byte[] payload = JsonUtils.marshalToByte(usedMarker);
    zookeeper.writeData(processPath, payload);
}
Also used : ProcessNodeEventData(com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData)

Example 2 with ProcessNodeEventData

use of com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData in project otter by alibaba.

the class SelectStageListener method recovery.

/**
 * Tries to reload the process ids that were created earlier but never used.
 *
 * <p>Per the original author's note: a mainstem switch may have happened, and the new S module
 * has to learn about processes the previous S module created but did not use, otherwise the
 * pipeline deadlocks. Process ids that are already used are handled by the e/t/l nodes.
 *
 * <p>For every id reported by the stage monitor, the process node is read and deserialized;
 * ids whose status is unused are passed to {@code addReply}. A {@link ZkException} on one id
 * is logged and does not stop the remaining ids from being examined.
 *
 * @param pipelineId pipeline whose process nodes are inspected
 */
private void recovery(Long pipelineId) {
    for (Long candidateId : stageMonitor.getCurrentProcessIds(false)) {
        String nodePath = StagePathUtils.getProcess(pipelineId, candidateId);
        try {
            byte[] raw = zookeeper.readData(nodePath);
            ProcessNodeEventData stored = JsonUtils.unmarshalFromByte(raw, ProcessNodeEventData.class);
            if (!stored.getStatus().isUnUsed()) {
                // Already used: nothing to re-register for this id.
                continue;
            }
            // Register the unused process id.
            addReply(candidateId);
        } catch (ZkException e) {
            logger.error("SelectStageListener", e);
        }
    }
}
Also used : ZkException(org.I0Itec.zkclient.exception.ZkException) ProcessNodeEventData(com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData)

Example 3 with ProcessNodeEventData

use of com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData in project otter by alibaba.

the class SelectProcessListener method processChanged.

/**
 * Handles a change of the process list and, when allowed, creates one new process node.
 *
 * <p>After delegating to the superclass, every reported id that is missing from
 * {@code replyProcessIds} is logged and re-added. Then, if the configured parallelism exceeds
 * the number of reported processes, the permit monitor grants permission, and the mainstem
 * node was written by the current node, a sequential persistent process node (status UNUSED,
 * mode RPC) is created under the process root and its id is added via {@code addReply}.
 * The available capacity is re-checked against zookeeper inside a {@code synchronized (this)}
 * section before creating.
 *
 * <p>On a {@link ZkException}, {@code recovery} is run and the error is logged.
 *
 * @param processIds the process ids reported by the change notification
 */
public void processChanged(List<Long> processIds) {
    super.processChanged(processIds);

    // Added by ljh on 2012-09-13 to address the zookeeper ConnectionLoss problem.
    for (Long reportedId : processIds) {
        if (replyProcessIds.contains(reportedId)) {
            continue;
        }
        logger.warn("process is not in order, please check processId:{}", reportedId);
        addReply(reportedId);
    }

    try {
        String processRoot = StagePathUtils.getProcessRoot(getPipelineId());
        // Create tasks according to the configured parallelism.
        int freeSlots = ArbitrateConfigUtils.getParallelism(getPipelineId()) - processIds.size();
        if (freeSlots <= 0) {
            return;
        }

        PermitMonitor permitMonitor = ArbitrateFactory.getInstance(getPipelineId(), PermitMonitor.class);
        if (!permitMonitor.isPermit()) {
            // Not permitted: do nothing.
            return;
        }

        String mainStemPath = StagePathUtils.getMainStem(getPipelineId());
        byte[] mainStemBytes = zookeeper.readData(mainStemPath, true);
        if (mainStemBytes == null) {
            return;
        }

        MainStemEventData mainStem = JsonUtils.unmarshalFromByte(mainStemBytes, MainStemEventData.class);
        if (!mainStem.getNid().equals(ArbitrateConfigUtils.getCurrentNid())) {
            // The mainstem was not set by this node: do nothing.
            return;
        }

        synchronized (this) {
            // Fetch the children once more and check the capacity a second time.
            List<String> liveProcesses = zookeeper.getChildren(processRoot);
            freeSlots = ArbitrateConfigUtils.getParallelism(getPipelineId()) - liveProcesses.size();
            if (freeSlots <= 0) {
                return;
            }

            // Payload of the new node: marked as unused, RPC mode, owned by this node.
            ProcessNodeEventData freshNode = new ProcessNodeEventData();
            freshNode.setStatus(ProcessNodeEventData.Status.UNUSED);
            freshNode.setMode(ArbitrateMode.RPC);
            freshNode.setNid(ArbitrateConfigUtils.getCurrentNid());
            byte[] freshNodeBytes = JsonUtils.marshalToByte(freshNode);

            // Created as a sequential node; its id is parsed from the last path segment.
            String createdPath = zookeeper.create(processRoot + "/", freshNodeBytes, CreateMode.PERSISTENT_SEQUENTIAL);
            String createdNode = StringUtils.substringAfterLast(createdPath, "/");

            // Add it to the current process list.
            Long createdId = StagePathUtils.getProcessId(createdNode);
            addReply(createdId);
        }
    } catch (ZkException e) {
        // After an exception run one recovery to read the latest state; this covers the case
        // where the create actually succeeded although ConnectionLoss was reported.
        recovery(getPipelineId());
        logger.error("add process error!", e);
    }
}
Also used : ZkException(org.I0Itec.zkclient.exception.ZkException) PermitMonitor(com.alibaba.otter.shared.arbitrate.impl.setl.monitor.PermitMonitor) MainStemEventData(com.alibaba.otter.shared.arbitrate.model.MainStemEventData) ProcessNodeEventData(com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData)

Example 4 with ProcessNodeEventData

use of com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData in project otter by alibaba.

the class ArbitrateViewServiceImpl method listProcesses.

/**
 * Builds per-process stage statistics for a pipeline by reading the process and stage nodes
 * from zookeeper.
 *
 * <p>For each process id (sorted ascending) a {@link ProcessStat} is created whose stage list
 * contains one {@link StageStat} per existing stage node, plus, where applicable, one extra
 * entry for the stage considered to be currently running. Start/end times are taken from
 * zookeeper node modification times.
 *
 * <p>A process is left out of the result when one of the {@code continue} statements inside
 * the try block fires (empty process data, unused process, or last stage already LOAD),
 * because {@code continue} skips the {@code processStats.add} at the end of the loop body.
 *
 * @param channelId channel the pipeline belongs to
 * @param pipelineId pipeline whose processes are listed
 * @return the collected process statistics, in ascending process-id order
 * @throws ArbitrateException wrapping any {@code KeeperException} other than
 *         {@code NoNodeException}
 */
public List<ProcessStat> listProcesses(Long channelId, Long pipelineId) {
    List<ProcessStat> processStats = new ArrayList<ProcessStat>();
    String processRoot = ManagePathUtils.getProcessRoot(channelId, pipelineId);
    IZkConnection connection = zookeeper.getConnection();
    // zkclient keeps fetching stat information separate from the normal operations, so the
    // native zk client is used here as an optimization
    ZooKeeper orginZk = ((ZooKeeperx) connection).getZookeeper();
    // Fetch the list of all processes
    List<String> processNodes = zookeeper.getChildren(processRoot);
    List<Long> processIds = new ArrayList<Long>();
    for (String processNode : processNodes) {
        processIds.add(ManagePathUtils.getProcessId(processNode));
    }
    Collections.sort(processIds);
    for (int i = 0; i < processIds.size(); i++) {
        Long processId = processIds.get(i);
        // The current process may change while it is being inspected
        ProcessStat processStat = new ProcessStat();
        processStat.setPipelineId(pipelineId);
        processStat.setProcessId(processId);
        List<StageStat> stageStats = new ArrayList<StageStat>();
        processStat.setStageStats(stageStats);
        try {
            String processPath = ManagePathUtils.getProcess(channelId, pipelineId, processId);
            Stat zkProcessStat = new Stat();
            List<String> stages = orginZk.getChildren(processPath, false, zkProcessStat);
            Collections.sort(stages, new StageComparator());
            StageStat prev = null;
            for (String stage : stages) {
                // Iterate over every stage under the process
                String stagePath = processPath + "/" + stage;
                Stat zkStat = new Stat();
                StageStat stageStat = new StageStat();
                stageStat.setPipelineId(pipelineId);
                stageStat.setProcessId(processId);
                byte[] bytes = orginZk.getData(stagePath, false, zkStat);
                if (bytes != null && bytes.length > 0) {
                    // Special handling of the data stored in zookeeper: the manager does not have the
                    // PipeKey class that the node uses, so deserialization would fail; work around it
                    // by removing the '@' characters
                    String json = StringUtils.remove(new String(bytes, "UTF-8"), '@');
                    EtlEventData data = JsonUtils.unmarshalFromString(json, EtlEventData.class);
                    stageStat.setNumber(data.getNumber());
                    stageStat.setSize(data.getSize());
                    // Copy the event's extension entries and add the node ids and description
                    Map exts = new HashMap();
                    if (!CollectionUtils.isEmpty(data.getExts())) {
                        exts.putAll(data.getExts());
                    }
                    exts.put("currNid", data.getCurrNid());
                    exts.put("nextNid", data.getNextNid());
                    exts.put("desc", data.getDesc());
                    stageStat.setExts(exts);
                }
                if (prev != null) {
                    // The start time is the end time of the previous node
                    stageStat.setStartTime(prev.getEndTime());
                } else {
                    // Last-modified time of the process node (select)
                    stageStat.setStartTime(zkProcessStat.getMtime());
                // The USED flag is set after await succeeds
                }
                stageStat.setEndTime(zkStat.getMtime());
                if (ArbitrateConstants.NODE_SELECTED.equals(stage)) {
                    stageStat.setStage(StageType.SELECT);
                } else if (ArbitrateConstants.NODE_EXTRACTED.equals(stage)) {
                    stageStat.setStage(StageType.EXTRACT);
                } else if (ArbitrateConstants.NODE_TRANSFORMED.equals(stage)) {
                    stageStat.setStage(StageType.TRANSFORM);
                // Disabled branch kept from the original source:
                // } else if
                // (ArbitrateConstants.NODE_LOADED.equals(stage)) {
                // stageStat.setStage(StageType.LOAD);
                }
                prev = stageStat;
                stageStats.add(stageStat);
            }
            // Add one entry for the stage that is currently being processed
            StageStat currentStageStat = new StageStat();
            currentStageStat.setPipelineId(pipelineId);
            currentStageStat.setProcessId(processId);
            if (prev == null) {
                byte[] bytes = orginZk.getData(processPath, false, zkProcessStat);
                if (bytes == null || bytes.length == 0) {
                    // Treat it as unused and ignore it (the process is not added to the result)
                    continue;
                }
                ProcessNodeEventData nodeData = JsonUtils.unmarshalFromByte(bytes, ProcessNodeEventData.class);
                if (nodeData.getStatus().isUnUsed()) {
                    // Skip this process (it is not added to the result)
                    continue;
                } else {
                    // select operation
                    currentStageStat.setStage(StageType.SELECT);
                    currentStageStat.setStartTime(zkProcessStat.getMtime());
                }
            } else {
                // Inspect the previous node to determine the current stage
                // NOTE(review): if the last stage node name matched none of the branches above,
                // prev.getStage() is presumably null and the calls below would throw a
                // NullPointerException - confirm whether such node names can occur.
                StageType stage = prev.getStage();
                if (stage.isSelect()) {
                    currentStageStat.setStage(StageType.EXTRACT);
                } else if (stage.isExtract()) {
                    currentStageStat.setStage(StageType.TRANSFORM);
                } else if (stage.isTransform()) {
                    currentStageStat.setStage(StageType.LOAD);
                } else if (stage.isLoad()) {
                    // Already the last node (the process is not added to the result)
                    continue;
                }
                // The start time is the end time of the previous node
                currentStageStat.setStartTime(prev.getEndTime());
            }
            if (currentStageStat.getStage().isLoad()) {
                // A load entry is only added for the first process node
                if (i == 0) {
                    stageStats.add(currentStageStat);
                }
            } else {
                // In all other cases the entry is added
                stageStats.add(currentStageStat);
            }
        } catch (NoNodeException e) {
        // ignore: the process is still added below with whatever stages were collected
        } catch (KeeperException e) {
            throw new ArbitrateException(e);
        } catch (InterruptedException e) {
        // ignore
        // NOTE(review): the interrupt status is not restored here, so callers cannot observe
        // the interruption - confirm whether Thread.currentThread().interrupt() is wanted.
        } catch (UnsupportedEncodingException e) {
        // ignore
        }
        processStats.add(processStat);
    }
    return processStats;
}
Also used : NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ProcessStat(com.alibaba.otter.shared.common.model.statistics.stage.ProcessStat) Stat(org.apache.zookeeper.data.Stat) StageStat(com.alibaba.otter.shared.common.model.statistics.stage.StageStat) StageType(com.alibaba.otter.shared.common.model.config.enums.StageType) ZooKeeperx(com.alibaba.otter.shared.common.utils.zookeeper.ZooKeeperx) ProcessNodeEventData(com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData) StageComparator(com.alibaba.otter.shared.arbitrate.impl.setl.helper.StageComparator) IZkConnection(org.I0Itec.zkclient.IZkConnection) UnsupportedEncodingException(java.io.UnsupportedEncodingException) EtlEventData(com.alibaba.otter.shared.arbitrate.model.EtlEventData) ZooKeeper(org.apache.zookeeper.ZooKeeper) ProcessStat(com.alibaba.otter.shared.common.model.statistics.stage.ProcessStat) ArbitrateException(com.alibaba.otter.shared.arbitrate.exception.ArbitrateException) StageStat(com.alibaba.otter.shared.common.model.statistics.stage.StageStat) HashMap(java.util.HashMap) Map(java.util.Map) KeeperException(org.apache.zookeeper.KeeperException)

Example 5 with ProcessNodeEventData

use of com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData in project otter by alibaba.

the class NormalTerminProcess method processDelete.

/**
 * Deletes the process node of a finished process, creating the termin signal first when the
 * process was marked as used and this is not a retry.
 *
 * @param data termin event supplying the pipeline id and process id
 * @param noStage when true and the node is in zookeeper mode, {@code processDeleteFailed()}
 *        is called and the delete is attempted again with {@code noStage} set to false
 * @param retry when true, {@code createTermin} is skipped
 * @return {@code false} if the process node no longer exists; otherwise the result of the
 *         recursive delete, or, if that delete threw a {@code ZkException}, the value held
 *         before it (the {@code createTermin} result or {@code false})
 */
private boolean processDelete(TerminEventData data, boolean noStage, boolean retry) {
    Long pipelineId = data.getPipelineId();
    Long processId = data.getProcessId();
    boolean result = false;
    // The process node
    // Finally delete the process node
    String path = StagePathUtils.getProcess(pipelineId, processId);
    byte[] bytes = null;
    try {
        bytes = zookeeper.readData(path);
    } catch (ZkNoNodeException e) {
        // The node has already been deleted, simply ignore it
        return false;
    }
    ProcessNodeEventData nodeData = JsonUtils.unmarshalFromByte(bytes, ProcessNodeEventData.class);
    if (nodeData.getStatus().isUsed()) {
        if (noStage && nodeData.getMode().isZookeeper()) {
            // rpc mode simply has no stage nodes, so it does not need the sleep
            // Handles one case:
            // with two concurrent operations, one has already finished deleting all s/e/t/l modules
            // and the other has just arrived and finds nothing left to delete.
            // Both threads would then enter createTermin together, which causes concurrency problems,
            // so for this case they have to be staggered.
            // This can misjudge: it is also hit when a rollback/shutdown is issued before the S module
            // has finished; the probability is small, so the misjudgement is ignored.
            // (presumably processDeleteFailed() performs the sleep mentioned here - confirm)
            processDeleteFailed();
            // Access the process once more to see whether it has been deleted in the meantime
            return processDelete(data, false, retry);
        }
        // During this sleep the process may keep running for a while and produce new s/e/t nodes,
        // which makes the process delete fail and createTermin run repeatedly
        if (!retry) {
            // modified on 2012-08-14, a concurrency bug was hit:
            // 1. two processes a and b; a finishes first and deletes its process node, b is triggered
            //    immediately and completes in an extremely short time
            // 2. as a result the termin created by b is earlier than the termin created by a, so the
            //    termin signals are sent in the wrong order
            // The fix: create the termin node first and only then delete the process, which triggers
            // the next process and guarantees that termin nodes are created in order.
            // It also avoids the case where the process is deleted but creating the termin signal fails.
            // modified on 2012-09-06, a concurrency bug was hit:
            // a process had only finished the s/e modules when a shutdown was issued; the termin node
            // was created, but while deleting the process the old process created a t node.
            // The process delete then fails and triggers a retry, and the retry would create the termin
            // signal a second time, breaking the scheduling.
            // Hence this guard: the termin signal is only created when not in retry mode.
            // Create the termin node
            result = createTermin(data, pipelineId, processId);
        }
    }
    try {
        // Changed to false: another thread has already added this node
        result = zookeeper.deleteRecursive(StagePathUtils.getProcess(pipelineId, processId));
        if (!result) {
            // Retry once: while the manager is shutting down the node may have kept running for a
            // while, so another stage node was created
            doProcess(data, true);
        }
    } catch (ZkInterruptedException e) {
        throw e;
    } catch (ZkException e) {
        // Retry once: while the manager is shutting down the node may have kept running for a
        // while, so another stage node was created
        doProcess(data, true);
    }
    return result;
}
Also used : ZkNoNodeException(org.I0Itec.zkclient.exception.ZkNoNodeException) ZkException(org.I0Itec.zkclient.exception.ZkException) ProcessNodeEventData(com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData) ZkInterruptedException(org.I0Itec.zkclient.exception.ZkInterruptedException)

Aggregations

ProcessNodeEventData (com.alibaba.otter.shared.arbitrate.model.ProcessNodeEventData)8 ZkException (org.I0Itec.zkclient.exception.ZkException)5 PermitMonitor (com.alibaba.otter.shared.arbitrate.impl.setl.monitor.PermitMonitor)2 MainStemEventData (com.alibaba.otter.shared.arbitrate.model.MainStemEventData)2 ArbitrateException (com.alibaba.otter.shared.arbitrate.exception.ArbitrateException)1 StageComparator (com.alibaba.otter.shared.arbitrate.impl.setl.helper.StageComparator)1 EtlEventData (com.alibaba.otter.shared.arbitrate.model.EtlEventData)1 StageType (com.alibaba.otter.shared.common.model.config.enums.StageType)1 ProcessStat (com.alibaba.otter.shared.common.model.statistics.stage.ProcessStat)1 StageStat (com.alibaba.otter.shared.common.model.statistics.stage.StageStat)1 ZooKeeperx (com.alibaba.otter.shared.common.utils.zookeeper.ZooKeeperx)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 IZkConnection (org.I0Itec.zkclient.IZkConnection)1 ZkInterruptedException (org.I0Itec.zkclient.exception.ZkInterruptedException)1 ZkNoNodeException (org.I0Itec.zkclient.exception.ZkNoNodeException)1 KeeperException (org.apache.zookeeper.KeeperException)1 NoNodeException (org.apache.zookeeper.KeeperException.NoNodeException)1