Search in sources :

Example 31 with EventData

use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.

the class ViewExtractor method extract.

/**
 * Restricts every non-DDL event of the batch to the columns configured for its source table.
 *
 * <p>Tables without configured column pairs are left untouched. For the other tables the
 * columns, keys and (when present) old keys are passed through {@code columnFilter}. An event
 * left without keys aborts the extraction; an update left with nothing to change and no old
 * keys is removed from the batch.
 *
 * @param dbBatch the batch whose row events are filtered in place; must not be null and must
 *        carry a row batch
 * @throws ExtractException if an event has no key columns after filtering
 */
@Override
public void extract(DbBatch dbBatch) throws ExtractException {
    Assert.notNull(dbBatch);
    Assert.notNull(dbBatch.getRowBatch());
    Pipeline pipeline = getPipeline(dbBatch.getRowBatch().getIdentity().getPipelineId());

    // Both maps are keyed by the id of the source table of a data media pair.
    // pairsByTable only holds tables that have an explicit column pair list.
    Map<Long, List<ColumnPair>> pairsByTable = new HashMap<Long, List<ColumnPair>>();
    Map<Long, ColumnPairMode> modeByTable = new HashMap<Long, ColumnPairMode>();
    for (DataMediaPair mediaPair : pipeline.getPairs()) {
        List<ColumnPair> configured = mediaPair.getColumnPairs();
        // The mode is always recorded, even when no column pairs are configured.
        modeByTable.put(mediaPair.getSource().getId(), mediaPair.getColumnPairMode());
        if (CollectionUtils.isEmpty(configured)) {
            // No column pairs: all fields are synchronized by default, nothing to register.
            continue;
        }
        pairsByTable.put(mediaPair.getSource().getId(), configured);
    }

    List<EventData> rows = dbBatch.getRowBatch().getDatas();
    // A set is used to speed up the lookup performed by removeAll below.
    Set<EventData> ignored = new HashSet<EventData>();
    for (EventData row : rows) {
        if (row.getEventType().isDdl()) {
            continue;
        }
        List<ColumnPair> pairs = pairsByTable.get(row.getTableId());
        if (CollectionUtils.isEmpty(pairs)) {
            continue;
        }

        // Keep only the columns that have to be synchronized.
        ColumnPairMode mode = modeByTable.get(row.getTableId());
        row.setColumns(columnFilter(row.getColumns(), pairs, mode));
        row.setKeys(columnFilter(row.getKeys(), pairs, mode));
        if (!CollectionUtils.isEmpty(row.getOldKeys())) {
            row.setOldKeys(columnFilter(row.getOldKeys(), pairs, mode));
        }

        if (CollectionUtils.isEmpty(row.getKeys())) {
            // No primary key left after filtering: report an error.
            String message = String.format("eventData after viewExtractor has no pks , pls check! identity:%s, new eventData:%s", dbBatch.getRowBatch().getIdentity().toString(), row.toString());
            throw new ExtractException(message);
        }

        if (!row.getEventType().isUpdate()) {
            continue;
        }
        // An update with no remaining (changed) columns and no primary key change can be
        // ignored, which avoids generating syntactically invalid SQL.
        boolean nothingToUpdate = CollectionUtils.isEmpty(row.getColumns()) || CollectionUtils.isEmpty(row.getUpdatedColumns());
        if (nothingToUpdate && CollectionUtils.isEmpty(row.getOldKeys())) {
            ignored.add(row);
        }
    }

    if (!ignored.isEmpty()) {
        rows.removeAll(ignored);
    }
}
Also used : ColumnPair(com.alibaba.otter.shared.common.model.config.data.ColumnPair) ColumnPairMode(com.alibaba.otter.shared.common.model.config.data.ColumnPairMode) ExtractException(com.alibaba.otter.node.etl.extract.exceptions.ExtractException) DataMediaPair(com.alibaba.otter.shared.common.model.config.data.DataMediaPair) HashMap(java.util.HashMap) EventData(com.alibaba.otter.shared.etl.model.EventData) Pipeline(com.alibaba.otter.shared.common.model.config.pipeline.Pipeline) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet)

Example 32 with EventData

use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.

the class RowDataHttpPipe method getDbBatch.

/**
 * Downloads the archive referenced by {@code key} and deserializes it into a {@link DbBatch}.
 *
 * <p>The archive is read as two consecutive sections, each made of a 4-byte length prefix
 * followed by a protobuf message: first a {@code BatchProto.RowBatch}, then a
 * {@code BatchProto.FileBatch}. When the key carries both a key and a crc, the archive is
 * passed through {@code decodeFile} before it is read.
 *
 * @param key pipe key providing the download url, the identity and the optional key/crc pair
 * @return the batch rebuilt from the archive, holding both the row batch and the file batch
 * @throws PipeException with message {@code download_error} if the retrieval fails, or
 *         {@code deserial_error} if the archive cannot be read or parsed
 */
private DbBatch getDbBatch(HttpPipeKey key) {
    String dataUrl = key.getUrl();
    Pipeline pipeline = configClientService.findPipeline(key.getIdentity().getPipelineId());
    DataRetriever dataRetriever = dataRetrieverFactory.createRetriever(pipeline.getParameters().getRetriever(), dataUrl, downloadDir);
    File archiveFile = null;
    try {
        dataRetriever.connect();
        dataRetriever.doRetrieve();
        archiveFile = dataRetriever.getDataAsFile();
    } catch (Exception e) {
        dataRetriever.abort();
        throw new PipeException("download_error", e);
    } finally {
        dataRetriever.disconnect();
    }
    // Handle data that was transferred encrypted.
    if (StringUtils.isNotEmpty(key.getKey()) && StringUtils.isNotEmpty(key.getCrc())) {
        decodeFile(archiveFile, key.getKey(), key.getCrc());
    }
    InputStream input = null;
    try {
        input = new BufferedInputStream(new FileInputStream(archiveFile));
        DbBatch dbBatch = new DbBatch();
        int length = readLengthPrefix(input);
        BatchProto.RowBatch rowbatchProto = BatchProto.RowBatch.parseFrom(new LimitedInputStream(input, length));
        // Rebuild the original model object.
        RowBatch rowBatch = new RowBatch();
        rowBatch.setIdentity(build(rowbatchProto.getIdentity()));
        for (BatchProto.RowData rowDataProto : rowbatchProto.getRowsList()) {
            EventData eventData = new EventData();
            eventData.setPairId(rowDataProto.getPairId());
            eventData.setTableId(rowDataProto.getTableId());
            eventData.setTableName(rowDataProto.getTableName());
            eventData.setSchemaName(rowDataProto.getSchemaName());
            eventData.setEventType(EventType.valuesOf(rowDataProto.getEventType()));
            eventData.setExecuteTime(rowDataProto.getExecuteTime());
            // add by ljh at 2012-10-31
            if (StringUtils.isNotEmpty(rowDataProto.getSyncMode())) {
                eventData.setSyncMode(SyncMode.valuesOf(rowDataProto.getSyncMode()));
            }
            if (StringUtils.isNotEmpty(rowDataProto.getSyncConsistency())) {
                eventData.setSyncConsistency(SyncConsistency.valuesOf(rowDataProto.getSyncConsistency()));
            }
            // Primary keys.
            List<EventColumn> keys = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getKeysList()) {
                keys.add(buildColumn(columnProto));
            }
            eventData.setKeys(keys);
            // Old primary keys; left unset on the event when the message carries none.
            if (CollectionUtils.isEmpty(rowDataProto.getOldKeysList()) == false) {
                List<EventColumn> oldKeys = new ArrayList<EventColumn>();
                for (BatchProto.Column columnProto : rowDataProto.getOldKeysList()) {
                    oldKeys.add(buildColumn(columnProto));
                }
                eventData.setOldKeys(oldKeys);
            }
            // Regular column values.
            List<EventColumn> columns = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getColumnsList()) {
                columns.add(buildColumn(columnProto));
            }
            eventData.setColumns(columns);
            eventData.setRemedy(rowDataProto.getRemedy());
            eventData.setSize(rowDataProto.getSize());
            eventData.setSql(rowDataProto.getSql());
            eventData.setDdlSchemaName(rowDataProto.getDdlSchemaName());
            eventData.setHint(rowDataProto.getHint());
            eventData.setWithoutSchema(rowDataProto.getWithoutSchema());
            // Add the event to the row batch.
            rowBatch.merge(eventData);
        }
        dbBatch.setRowBatch(rowBatch);
        length = readLengthPrefix(input);
        BatchProto.FileBatch filebatchProto = BatchProto.FileBatch.parseFrom(new LimitedInputStream(input, length));
        // Rebuild the original model object.
        FileBatch fileBatch = new FileBatch();
        fileBatch.setIdentity(build(filebatchProto.getIdentity()));
        for (BatchProto.FileData fileDataProto : filebatchProto.getFilesList()) {
            FileData fileData = new FileData();
            fileData.setPairId(fileDataProto.getPairId());
            fileData.setTableId(fileDataProto.getTableId());
            fileData.setEventType(EventType.valuesOf(fileDataProto.getEventType()));
            fileData.setLastModifiedTime(fileDataProto.getLastModifiedTime());
            fileData.setNameSpace(fileDataProto.getNamespace());
            fileData.setPath(fileDataProto.getPath());
            fileData.setSize(fileDataProto.getSize());
            // Add the file to the file batch.
            fileBatch.getFiles().add(fileData);
        }
        dbBatch.setFileBatch(fileBatch);
        return dbBatch;
    } catch (IOException e) {
        throw new PipeException("deserial_error", e);
    } finally {
        // Close the archive stream itself; it is the only resource opened in this try block.
        IOUtils.closeQuietly(input);
    }
}

/**
 * Reads the 4-byte length prefix that precedes a serialized section of the archive.
 *
 * <p>A single {@code read} call may return fewer bytes than requested, so the read is repeated
 * until the prefix is complete.
 *
 * @param input the archive stream, positioned at the start of a length prefix
 * @return the length decoded by {@code ByteUtils.bytes2int}
 * @throws IOException if the stream ends before four bytes have been read
 */
private static int readLengthPrefix(InputStream input) throws IOException {
    byte[] lengthBytes = new byte[4];
    int filled = 0;
    while (filled < lengthBytes.length) {
        int count = input.read(lengthBytes, filled, lengthBytes.length - filled);
        if (count < 0) {
            throw new IOException("unexpected end of stream while reading length prefix");
        }
        filled += count;
    }
    return ByteUtils.bytes2int(lengthBytes);
}
Also used : EventColumn(com.alibaba.otter.shared.etl.model.EventColumn) ArrayList(java.util.ArrayList) DbBatch(com.alibaba.otter.shared.etl.model.DbBatch) EventData(com.alibaba.otter.shared.etl.model.EventData) BufferedInputStream(java.io.BufferedInputStream) FileData(com.alibaba.otter.shared.etl.model.FileData) FileBatch(com.alibaba.otter.shared.etl.model.FileBatch) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DataRetriever(com.alibaba.otter.node.etl.common.io.download.DataRetriever) IOException(java.io.IOException) BatchProto(com.alibaba.otter.node.etl.model.protobuf.BatchProto) IOException(java.io.IOException) PipeException(com.alibaba.otter.node.etl.common.pipe.exception.PipeException) FileInputStream(java.io.FileInputStream) Pipeline(com.alibaba.otter.shared.common.model.config.pipeline.Pipeline) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) PipeException(com.alibaba.otter.node.etl.common.pipe.exception.PipeException) JSONReader(com.alibaba.fastjson.JSONReader) File(java.io.File)

Example 33 with EventData

use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.

the class DbLoadAction method doLoad.

/**
 * Loads the data of every table in two passes: all deletes first, then inserts and updates.
 *
 * <p>With {@code useBatch} enabled the rows of a table are grouped by {@code split};
 * otherwise every row forms its own single-element group. Each pass is handed to
 * {@code doDryRun} when the pipeline is configured as a dry run and to {@code doTwoPhase}
 * otherwise.
 *
 * @param context the load context, used to read the pipeline parameters
 * @param loadData the per-table data to load
 */
private void doLoad(final DbLoadContext context, DbLoadData loadData) {
    List<List<EventData>> groups = new ArrayList<List<EventData>>();

    // Pass 1: deletes go first. A unique-key change is usually applied as delete + insert,
    // so running the deletes ahead avoids conflicting concurrent updates.
    for (TableLoadData table : loadData.getTables()) {
        if (!useBatch) {
            for (EventData row : table.getDeleteDatas()) {
                groups.add(Arrays.asList(row));
            }
            continue;
        }
        groups.addAll(split(table.getDeleteDatas()));
    }
    if (!context.getPipeline().getParameters().isDryRun()) {
        doTwoPhase(context, groups, true);
    } else {
        doDryRun(context, groups, true);
    }
    groups.clear();

    // Pass 2: inserts followed by updates, table by table.
    for (TableLoadData table : loadData.getTables()) {
        if (!useBatch) {
            for (EventData row : table.getInsertDatas()) {
                groups.add(Arrays.asList(row));
            }
            for (EventData row : table.getUpadateDatas()) {
                groups.add(Arrays.asList(row));
            }
            continue;
        }
        groups.addAll(split(table.getInsertDatas()));
        groups.addAll(split(table.getUpadateDatas()));
    }
    if (!context.getPipeline().getParameters().isDryRun()) {
        doTwoPhase(context, groups, true);
    } else {
        doDryRun(context, groups, true);
    }
    groups.clear();
}
Also used : TableLoadData(com.alibaba.otter.node.etl.load.loader.db.DbLoadData.TableLoadData) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) EventData(com.alibaba.otter.shared.etl.model.EventData)

Example 34 with EventData

use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.

the class DbLoadAction method doTwoPhase.

/**
 * Loads the given row groups in two phases: first in parallel, then serially on failure.
 *
 * <p>Phase one submits one {@code DbLoadWorker} per non-empty group to the executor and waits
 * for every result. If any worker returns or throws an exception, phase two re-runs all
 * submitted rows with batching disabled: row by row when the pipeline allows skipping load
 * exceptions, otherwise as one single worker whose failure is rethrown.
 *
 * @param context the load context shared by all workers
 * @param totalRows the row groups to load; empty groups are ignored
 * @param canBatch whether the phase-one workers may use batch execution
 * @throws LoadException if phase two fails and load exceptions must not be skipped
 */
private void doTwoPhase(DbLoadContext context, List<List<EventData>> totalRows, boolean canBatch) {
    // Phase one: submit every non-empty group. submittedRows is kept parallel to results so
    // that a future can always be matched with the rows it was created for, even when empty
    // groups were skipped.
    List<List<EventData>> submittedRows = new ArrayList<List<EventData>>();
    List<Future<Exception>> results = new ArrayList<Future<Exception>>();
    for (List<EventData> rows : totalRows) {
        if (CollectionUtils.isEmpty(rows)) {
            // Skip empty groups.
            continue;
        }
        results.add(executor.submit(new DbLoadWorker(context, rows, canBatch)));
        submittedRows.add(rows);
    }
    boolean partFailed = false;
    for (int i = 0; i < results.size(); i++) {
        Future<Exception> result = results.get(i);
        Exception ex = null;
        try {
            ex = result.get();
            for (EventData data : submittedRows.get(i)) {
                // Notify that loading has completed.
                interceptor.after(context, data);
            }
        } catch (Exception e) {
            ex = e;
        }
        if (ex != null) {
            logger.warn("##load phase one failed!", ex);
            partFailed = true;
        }
    }
    if (partFailed) {
        // Retry everything that ran in phase one instead of only the recorded failed datas,
        // so that a miscalculated failed-data list cannot cause data loss.
        List<EventData> retryEventDatas = new ArrayList<EventData>();
        for (List<EventData> rows : submittedRows) {
            retryEventDatas.addAll(rows);
        }
        // Reset the failed data recorded during phase one.
        context.getFailedDatas().clear();
        // May be null: data serialized by an older manager version has no skipLoadException
        // setting stored in the database.
        Boolean skipLoadException = context.getPipeline().getParameters().getSkipLoadException();
        if (skipLoadException != null && skipLoadException) {
            // Skipping single failures is allowed: load the rows one by one so that exactly
            // the failing records are filtered out and logged.
            for (EventData retryEventData : retryEventDatas) {
                // Batching is forced off.
                DbLoadWorker worker = new DbLoadWorker(context, Arrays.asList(retryEventData), false);
                try {
                    Exception ex = worker.call();
                    if (ex != null) {
                        // do skip
                        logger.warn("skip exception for data : {} , caused by {}", retryEventData, ExceptionUtils.getFullStackTrace(ex));
                    }
                } catch (Exception ex) {
                    // do skip
                    logger.warn("skip exception for data : {} , caused by {}", retryEventData, ExceptionUtils.getFullStackTrace(ex));
                }
            }
        } else {
            // Process everything in one worker to reduce thread scheduling; batching is
            // forced off.
            DbLoadWorker worker = new DbLoadWorker(context, retryEventDatas, false);
            try {
                Exception ex = worker.call();
                if (ex != null) {
                    // Thrown here and caught right below, so both failure paths share the
                    // same handling.
                    throw ex;
                }
            } catch (Exception ex) {
                logger.error("##load phase two failed!", ex);
                throw new LoadException(ex);
            }
        }
        for (EventData data : retryEventDatas) {
            // Notify that loading has completed.
            interceptor.after(context, data);
        }
    }
}
Also used : ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) DataIntegrityViolationException(org.springframework.dao.DataIntegrityViolationException) DataAccessException(org.springframework.dao.DataAccessException) SQLException(java.sql.SQLException) DeadlockLoserDataAccessException(org.springframework.dao.DeadlockLoserDataAccessException) LoadException(com.alibaba.otter.node.etl.load.exception.LoadException) EventData(com.alibaba.otter.shared.etl.model.EventData) LoadException(com.alibaba.otter.node.etl.load.exception.LoadException)

Example 35 with EventData

use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.

the class SelectTask method processSelect.

/**
 * Select loop of the task: runs while {@code running} is true.
 *
 * <p>Each iteration waits on {@code canStartSelector} and on the pipeline permit, fetches a
 * message from {@code otterSelector}, and then either rolls the message back, records it as an
 * empty batch, or wraps it into a {@code RowBatch} that a worker submitted to
 * {@code executorService} hands to {@code rowDataPipeDelegate}. Non-interrupt failures are
 * logged and reported through {@code sendRollbackTermin}; an interrupt ends the loop.
 */
private void processSelect() {
    while (running) {
        try {
            // Wait for ProcessTermin to be exhausted; this call blocks.
            // When ProcessTermin detects a rollback it signals a pause immediately, which is more timely than the distributed permit.
            canStartSelector.get();
            // Check whether this node is the working node: the S stage must never run on two nodes at once, otherwise the selector easily produces corrupted data.
            if (needCheck) {
                checkContinueWork();
            }
            // When the pipeline is suspended, wait until the manager has handled it and lifted the suspension before synchronizing.
            // This also lets the loop stop promptly after a rollback.
            arbitrateEventService.toolEvent().waitForPermit(pipelineId);
            // Problem that startVersion addresses: when a rollback happens, decide as well as possible whether fetched data is from before or after the rollback, and try to discard the data from before it.
            // (After a rollback the batches fetched earlier did not actually succeed, and get returns the batch that follows them. Without discarding, the later data would run first and then the failed point would be executed again.)
            // int startVersion = rversion.get();
            Message gotMessage = otterSelector.selector();
            // modified by ljh at 2012-09-10: startVersion must be read after the data has been fetched.
            // Reading it before the fetch (a concurrency bug that was hit):
            // // a.
            // startVersion is read first, then the data is fetched; the rollback starts and completes during the fetch, so when selector returns the data has already been read to the end
            // // b. the version check then sees a change and triggers another fetch, but by then the get
            // cursor is already at the end of the queue and nothing can be fetched, so the loop waits forever
            // Reading it after the fetch (a small flaw):
            // // a.
            // When rollback and selector run concurrently and old pre-rollback data is fetched, startVersion is not initialized yet, so the rollback cannot be detected and later changes are synchronized early
            // (the probability is fairly high and depends on the interval between selector and the initialization of startVersion)
            int startVersion = rversion.get();
            if (canStartSelector.state() == false) {
                // An exception may have occurred.
                // Roll back the data fetched at the moment of the exception: otterSelector.selector() loops, so a rollback may have happened that it has not noticed yet.
                rollback(gotMessage.getId());
                continue;
            }
            if (CollectionUtils.isEmpty(gotMessage.getDatas())) {
                // Empty data still has to advance the cursor; the data may have been loop-back data that was filtered out.
                // Add it to the buffer of batches awaiting a response; no termin signal needs to be awaited because no s/e/t/l flow was started.
                batchBuffer.put(new BatchTermin(gotMessage.getId(), false));
                continue;
            }
            final EtlEventData etlEventData = arbitrateEventService.selectEvent().await(pipelineId);
            if (rversion.get() != startVersion) {
                // The version changed, so a rollback happened in between and this data has to be discarded.
                logger.warn("rollback happend , should skip this data and get new message.");
                // Make sure the rollback has completed.
                canStartSelector.get();
                // From here on one s/e/t/l round has to run whether or not there is data.
                gotMessage = otterSelector.selector();
            }
            final Message message = gotMessage;
            final BatchTermin batchTermin = new BatchTermin(message.getId(), etlEventData.getProcessId());
            // Add it to the buffer of batches awaiting a response.
            batchBuffer.put(batchTermin);
            Runnable task = new Runnable() {

                public void run() {
                    // Set up the profiling information.
                    boolean profiling = isProfiling();
                    Long profilingStartTime = null;
                    if (profiling) {
                        profilingStartTime = System.currentTimeMillis();
                    }
                    MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                    // Remember the thread name so it can be restored in the finally block.
                    String currentName = Thread.currentThread().getName();
                    Thread.currentThread().setName(createTaskName(pipelineId, "SelectWorker"));
                    try {
                        pipeline = configClientService.findPipeline(pipelineId);
                        List<EventData> eventData = message.getDatas();
                        // Falls back to the etl event start time when the message has no data.
                        long startTime = etlEventData.getStartTime();
                        if (!CollectionUtils.isEmpty(eventData)) {
                            startTime = eventData.get(0).getExecuteTime();
                        }
                        Channel channel = configClientService.findChannelByPipelineId(pipelineId);
                        RowBatch rowBatch = new RowBatch();
                        // Build the unique identity.
                        Identity identity = new Identity();
                        identity.setChannelId(channel.getId());
                        identity.setPipelineId(pipelineId);
                        identity.setProcessId(etlEventData.getProcessId());
                        rowBatch.setIdentity(identity);
                        // Merge the data into the row batch.
                        for (EventData data : eventData) {
                            rowBatch.merge(data);
                        }
                        long nextNodeId = etlEventData.getNextNid();
                        List<PipeKey> pipeKeys = rowDataPipeDelegate.put(new DbBatch(rowBatch), nextNodeId);
                        etlEventData.setDesc(pipeKeys);
                        etlEventData.setNumber((long) eventData.size());
                        // Use the first record of the original data.
                        etlEventData.setFirstTime(startTime);
                        etlEventData.setBatchId(message.getId());
                        if (profiling) {
                            Long profilingEndTime = System.currentTimeMillis();
                            stageAggregationCollector.push(pipelineId, StageType.SELECT, new AggregationItem(profilingStartTime, profilingEndTime));
                        }
                        arbitrateEventService.selectEvent().single(etlEventData);
                    } catch (Throwable e) {
                        if (!isInterrupt(e)) {
                            logger.error(String.format("[%s] selectwork executor is error! data:%s", pipelineId, etlEventData), e);
                            sendRollbackTermin(pipelineId, e);
                        } else {
                            logger.info(String.format("[%s] selectwork executor is interrrupt! data:%s", pipelineId, etlEventData), e);
                        }
                    } finally {
                        Thread.currentThread().setName(currentName);
                        MDC.remove(OtterConstants.splitPipelineLogFileKey);
                    }
                }
            };
            // Build a pending task so that it can exit when the threads are shut down.
            SetlFuture extractFuture = new SetlFuture(StageType.SELECT, etlEventData.getProcessId(), pendingFuture, task);
            executorService.execute(extractFuture);
        } catch (Throwable e) {
            if (!isInterrupt(e)) {
                logger.error(String.format("[%s] selectTask is error!", pipelineId), e);
                sendRollbackTermin(pipelineId, e);
            } else {
                logger.info(String.format("[%s] selectTask is interrrupt!", pipelineId), e);
                // An interrupt ends the select loop.
                return;
            }
        }
    }
}
Also used : Message(com.alibaba.otter.node.etl.select.selector.Message) Channel(com.alibaba.otter.shared.common.model.config.channel.Channel) PipeKey(com.alibaba.otter.node.etl.common.pipe.PipeKey) TerminEventData(com.alibaba.otter.shared.arbitrate.model.TerminEventData) EtlEventData(com.alibaba.otter.shared.arbitrate.model.EtlEventData) EventData(com.alibaba.otter.shared.etl.model.EventData) DbBatch(com.alibaba.otter.shared.etl.model.DbBatch) EtlEventData(com.alibaba.otter.shared.arbitrate.model.EtlEventData) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) AggregationItem(com.alibaba.otter.node.etl.common.jmx.StageAggregation.AggregationItem) Identity(com.alibaba.otter.shared.etl.model.Identity) SetlFuture(com.alibaba.otter.node.etl.extract.SetlFuture)

Aggregations

EventData (com.alibaba.otter.shared.etl.model.EventData)48 ArrayList (java.util.ArrayList)20 Pipeline (com.alibaba.otter.shared.common.model.config.pipeline.Pipeline)19 EventColumn (com.alibaba.otter.shared.etl.model.EventColumn)18 DataMediaPair (com.alibaba.otter.shared.common.model.config.data.DataMediaPair)16 Test (org.testng.annotations.Test)16 BaseDbTest (com.alibaba.otter.node.etl.BaseDbTest)15 RowBatch (com.alibaba.otter.shared.etl.model.RowBatch)14 Identity (com.alibaba.otter.shared.etl.model.Identity)9 MapMaker (com.google.common.collect.MapMaker)9 RowKey (com.alibaba.otter.node.etl.load.loader.db.DbLoadMerger.RowKey)8 DataMedia (com.alibaba.otter.shared.common.model.config.data.DataMedia)8 DbBatch (com.alibaba.otter.shared.etl.model.DbBatch)7 DbDialect (com.alibaba.otter.node.etl.common.db.dialect.DbDialect)5 ExtractException (com.alibaba.otter.node.etl.extract.exceptions.ExtractException)5 LoadException (com.alibaba.otter.node.etl.load.exception.LoadException)4 FileData (com.alibaba.otter.shared.etl.model.FileData)4 SelectException (com.alibaba.otter.node.etl.select.exceptions.SelectException)3 Channel (com.alibaba.otter.shared.common.model.config.channel.Channel)3 EventType (com.alibaba.otter.shared.etl.model.EventType)3