
Example 11 with RowBatch

Use of com.alibaba.otter.shared.etl.model.RowBatch in project otter by alibaba.

The class RowDataHttpPipe, method getDbBatch.

// Fetch and deserialize the corresponding DbBatch
private DbBatch getDbBatch(HttpPipeKey key) {
    String dataUrl = key.getUrl();
    Pipeline pipeline = configClientService.findPipeline(key.getIdentity().getPipelineId());
    DataRetriever dataRetriever = dataRetrieverFactory.createRetriever(pipeline.getParameters().getRetriever(), dataUrl, downloadDir);
    File archiveFile = null;
    try {
        dataRetriever.connect();
        dataRetriever.doRetrieve();
        archiveFile = dataRetriever.getDataAsFile();
    } catch (Exception e) {
        dataRetriever.abort();
        throw new PipeException("download_error", e);
    } finally {
        dataRetriever.disconnect();
    }
    // Decrypt the archive if it was encrypted (key and CRC present)
    if (StringUtils.isNotEmpty(key.getKey()) && StringUtils.isNotEmpty(key.getCrc())) {
        decodeFile(archiveFile, key.getKey(), key.getCrc());
    }
    InputStream input = null;
    try {
        input = new BufferedInputStream(new FileInputStream(archiveFile));
        DbBatch dbBatch = new DbBatch();
        // Frame layout: a 4-byte length prefix, then `length` bytes of protobuf payload
        byte[] lengthBytes = new byte[4];
        // assumes read() fills all 4 bytes, which a BufferedInputStream over a local file will do
        input.read(lengthBytes);
        int length = ByteUtils.bytes2int(lengthBytes);
        BatchProto.RowBatch rowbatchProto = BatchProto.RowBatch.parseFrom(new LimitedInputStream(input, length));
        // Rebuild the original model object from the protobuf message
        RowBatch rowBatch = new RowBatch();
        rowBatch.setIdentity(build(rowbatchProto.getIdentity()));
        for (BatchProto.RowData rowDataProto : rowbatchProto.getRowsList()) {
            EventData eventData = new EventData();
            eventData.setPairId(rowDataProto.getPairId());
            eventData.setTableId(rowDataProto.getTableId());
            eventData.setTableName(rowDataProto.getTableName());
            eventData.setSchemaName(rowDataProto.getSchemaName());
            eventData.setEventType(EventType.valuesOf(rowDataProto.getEventType()));
            eventData.setExecuteTime(rowDataProto.getExecuteTime());
            // add by ljh at 2012-10-31
            if (StringUtils.isNotEmpty(rowDataProto.getSyncMode())) {
                eventData.setSyncMode(SyncMode.valuesOf(rowDataProto.getSyncMode()));
            }
            if (StringUtils.isNotEmpty(rowDataProto.getSyncConsistency())) {
                eventData.setSyncConsistency(SyncConsistency.valuesOf(rowDataProto.getSyncConsistency()));
            }
            // Primary-key columns
            List<EventColumn> keys = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getKeysList()) {
                keys.add(buildColumn(columnProto));
            }
            eventData.setKeys(keys);
            // Old primary-key values (present when the key itself changed)
            if (!CollectionUtils.isEmpty(rowDataProto.getOldKeysList())) {
                List<EventColumn> oldKeys = new ArrayList<EventColumn>();
                for (BatchProto.Column columnProto : rowDataProto.getOldKeysList()) {
                    oldKeys.add(buildColumn(columnProto));
                }
                eventData.setOldKeys(oldKeys);
            }
            // Concrete column values
            List<EventColumn> columns = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getColumnsList()) {
                columns.add(buildColumn(columnProto));
            }
            eventData.setColumns(columns);
            eventData.setRemedy(rowDataProto.getRemedy());
            eventData.setSize(rowDataProto.getSize());
            eventData.setSql(rowDataProto.getSql());
            eventData.setDdlSchemaName(rowDataProto.getDdlSchemaName());
            eventData.setHint(rowDataProto.getHint());
            eventData.setWithoutSchema(rowDataProto.getWithoutSchema());
            // Merge into the overall batch
            rowBatch.merge(eventData);
        }
        dbBatch.setRowBatch(rowBatch);
        // A FileBatch frame follows, with the same length-prefixed layout
        input.read(lengthBytes);
        length = ByteUtils.bytes2int(lengthBytes);
        BatchProto.FileBatch filebatchProto = BatchProto.FileBatch.parseFrom(new LimitedInputStream(input, length));
        // Rebuild the original model object
        FileBatch fileBatch = new FileBatch();
        fileBatch.setIdentity(build(filebatchProto.getIdentity()));
        for (BatchProto.FileData fileDataProto : filebatchProto.getFilesList()) {
            FileData fileData = new FileData();
            fileData.setPairId(fileDataProto.getPairId());
            fileData.setTableId(fileDataProto.getTableId());
            fileData.setEventType(EventType.valuesOf(fileDataProto.getEventType()));
            fileData.setLastModifiedTime(fileDataProto.getLastModifiedTime());
            fileData.setNameSpace(fileDataProto.getNamespace());
            fileData.setPath(fileDataProto.getPath());
            fileData.setSize(fileDataProto.getSize());
            // Add to the file batch
            fileBatch.getFiles().add(fileData);
        }
        dbBatch.setFileBatch(fileBatch);
        return dbBatch;
    } catch (IOException e) {
        throw new PipeException("deserial_error", e);
    } finally {
        IOUtils.closeQuietly(input);
    }
}
Also used : EventColumn(com.alibaba.otter.shared.etl.model.EventColumn) ArrayList(java.util.ArrayList) DbBatch(com.alibaba.otter.shared.etl.model.DbBatch) EventData(com.alibaba.otter.shared.etl.model.EventData) BufferedInputStream(java.io.BufferedInputStream) FileData(com.alibaba.otter.shared.etl.model.FileData) FileBatch(com.alibaba.otter.shared.etl.model.FileBatch) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DataRetriever(com.alibaba.otter.node.etl.common.io.download.DataRetriever) IOException(java.io.IOException) BatchProto(com.alibaba.otter.node.etl.model.protobuf.BatchProto) PipeException(com.alibaba.otter.node.etl.common.pipe.exception.PipeException) Pipeline(com.alibaba.otter.shared.common.model.config.pipeline.Pipeline) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) JSONReader(com.alibaba.fastjson.JSONReader) File(java.io.File)
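
getDbBatch reads a length-prefixed stream: a 4-byte length decoded with ByteUtils.bytes2int, then exactly that many protobuf bytes fenced off by LimitedInputStream so parseFrom cannot run into the next frame. A minimal sketch of the matching write side follows; this is a hypothetical helper, not otter's actual pipe writer, and it assumes ByteUtils.bytes2int decodes a big-endian prefix.

import java.io.IOException;
import java.io.OutputStream;

// Hypothetical framing helper: emits each protobuf payload behind the
// 4-byte length prefix that getDbBatch reads back.
public final class FrameWriter {

    public static void writeFrame(OutputStream out, byte[] payload) throws IOException {
        int len = payload.length;
        // 4-byte big-endian length prefix (assumes ByteUtils.bytes2int decodes big-endian)
        out.write((len >>> 24) & 0xFF);
        out.write((len >>> 16) & 0xFF);
        out.write((len >>> 8) & 0xFF);
        out.write(len & 0xFF);
        // then exactly `len` payload bytes, e.g. rowBatchProto.toByteArray()
        out.write(payload);
    }
}

Writing the RowBatch frame and then the FileBatch frame through such a helper yields exactly the layout the method consumes, with LimitedInputStream guaranteeing that parseFrom never reads past its own frame.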

Example 12 with RowBatch

Use of com.alibaba.otter.shared.etl.model.RowBatch in project otter by alibaba.

The class DataBatchLoader, method load.

public List<LoadContext> load(DbBatch data) {
    final RowBatch rowBatch = data.getRowBatch();
    final FileBatch fileBatch = data.getFileBatch();
    boolean existFileBatch = (fileBatch != null && !CollectionUtils.isEmpty(fileBatch.getFiles()) && data.getRoot() != null);
    boolean existRowBatch = (rowBatch != null && !CollectionUtils.isEmpty(rowBatch.getDatas()));
    int count = 0;
    List<RowBatch> rowBatchs = null;
    if (existRowBatch) {
        // Classify and merge by target data media; one load channel per media
        rowBatchs = split(rowBatch);
        count += rowBatchs.size();
    }
    if (existFileBatch) {
        count += 1;
    }
    WeightController controller = new WeightController(count);
    List<Future> futures = new ArrayList<Future>();
    ExecutorCompletionService completionService = new ExecutorCompletionService(executorService);
    if (existFileBatch) {
        submitFileBatch(futures, completionService, fileBatch, data.getRoot(), controller);
    }
    if (existRowBatch) {
        submitRowBatch(futures, completionService, rowBatchs, controller);
    }
    // Collect the async results first, recording the index at which an error occurred
    List<LoadContext> processedContexts = new ArrayList<LoadContext>();
    int index = 0;
    LoadException exception = null;
    while (index < futures.size()) {
        try {
            // take() may itself be interrupted
            Future future = completionService.take();
            future.get();
        } catch (InterruptedException e) {
            exception = new LoadException(e);
            break;
        } catch (ExecutionException e) {
            exception = new LoadException(e);
            break;
        }
        index++;
    }
    // If any worker returned with an exception, abort the whole schedule
    if (index < futures.size()) {
        // index < size means a failure: cancel unfinished tasks and collect finished results, so duplicate loads can be filtered on retry
        for (int errorIndex = 0; errorIndex < futures.size(); errorIndex++) {
            Future future = futures.get(errorIndex);
            if (future.isDone()) {
                try {
                    LoadContext loadContext = (LoadContext) future.get();
                    if (loadContext instanceof DbLoadContext) {
                        // Run error handling and record the failure to the store
                        dbInterceptor.error((DbLoadContext) loadContext);
                    }
                } catch (InterruptedException e) {
                // ignore
                } catch (ExecutionException e) {
                // ignore
                } catch (Exception e) {
                    logger.error("interceptor process error failed", e);
                }
            } else {
                // Cancel tasks that have not completed
                future.cancel(true);
            }
        }
    } else {
        for (int i = 0; i < futures.size(); i++) {
            // Collect the successfully completed results
            Future future = futures.get(i);
            try {
                LoadContext loadContext = (LoadContext) future.get();
                if (loadContext instanceof DbLoadContext) {
                    processedContexts.add((DbLoadContext) loadContext);
                }
            } catch (InterruptedException e) {
            // ignore
            } catch (ExecutionException e) {
            // ignore
            }
        }
    }
    if (exception != null) {
        throw exception;
    } else {
        return processedContexts;
    }
}
Also used : FileBatch(com.alibaba.otter.shared.etl.model.FileBatch) ArrayList(java.util.ArrayList) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) LoadException(com.alibaba.otter.node.etl.load.exception.LoadException) BeansException(org.springframework.beans.BeansException) ExecutionException(java.util.concurrent.ExecutionException) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) DbLoadContext(com.alibaba.otter.node.etl.load.loader.db.context.DbLoadContext) Future(java.util.concurrent.Future) FileLoadContext(com.alibaba.otter.node.etl.load.loader.db.context.FileLoadContext) LoadContext(com.alibaba.otter.node.etl.load.loader.LoadContext) WeightController(com.alibaba.otter.node.etl.load.loader.weight.WeightController)
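
load() uses a fail-fast pattern around ExecutorCompletionService: take() results as they finish, stop at the first failure, then cancel whatever has not completed. A self-contained sketch of just that pattern follows; the task bodies are placeholders, not otter's loaders.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

// Stand-alone illustration of the fail-fast scheduling used by load() above.
public class FailFastLoad {

    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        ExecutorCompletionService<String> cs = new ExecutorCompletionService<>(pool);
        List<Future<String>> futures = new ArrayList<>();
        for (int i = 0; i < 4; i++) {
            final int id = i;
            // placeholder task body; in load() this is a DbLoadAction / FileLoadAction call
            futures.add(cs.submit(() -> "batch-" + id));
        }
        int done = 0;
        try {
            while (done < futures.size()) {
                // take() returns tasks in completion order; get() rethrows any task failure
                cs.take().get();
                done++;
            }
        } catch (ExecutionException e) {
            // first failure: cancel everything still running, the same branch load() takes
            for (Future<String> f : futures) {
                if (!f.isDone()) {
                    f.cancel(true);
                }
            }
        } finally {
            pool.shutdown();
        }
    }
}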

Example 13 with RowBatch

Use of com.alibaba.otter.shared.etl.model.RowBatch in project otter by alibaba.

The class DataBatchLoader, method submitRowBatch.

private void submitRowBatch(List<Future> futures, ExecutorCompletionService completionService, final List<RowBatch> rowBatchs, final WeightController controller) {
    for (final RowBatch rowBatch : rowBatchs) {
        // Submit multiple parallel load channels
        futures.add(completionService.submit(new Callable<DbLoadContext>() {

            public DbLoadContext call() throws Exception {
                try {
                    MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(rowBatch.getIdentity().getPipelineId()));
                    // dbLoadAction is a pooled (prototype) bean
                    DbLoadAction dbLoadAction = (DbLoadAction) beanFactory.getBean("dbLoadAction", DbLoadAction.class);
                    return dbLoadAction.load(rowBatch, controller);
                } finally {
                    MDC.remove(OtterConstants.splitPipelineLogFileKey);
                }
            }
        }));
    }
}
Also used : RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) Callable(java.util.concurrent.Callable)
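
submitRowBatch brackets each Callable with MDC.put and MDC.remove so that log lines are routed to a per-pipeline log file. The same bracketing can be factored into a reusable wrapper; a sketch assuming SLF4J's org.slf4j.MDC, where the wrapper class itself is illustrative and not part of otter:

import java.util.concurrent.Callable;
import org.slf4j.MDC;

// Illustrative wrapper: scopes an MDC entry to a Callable so log output inside
// call() is routed by the MDC key, and pooled threads never keep stale context.
public final class MdcScopedCallable<V> implements Callable<V> {

    private final String key;
    private final String value;
    private final Callable<V> delegate;

    public MdcScopedCallable(String key, String value, Callable<V> delegate) {
        this.key = key;
        this.value = value;
        this.delegate = delegate;
    }

    public V call() throws Exception {
        // same bracketing as submitRowBatch: put before the work, remove in finally
        MDC.put(key, value);
        try {
            return delegate.call();
        } finally {
            MDC.remove(key);
        }
    }
}

With such a wrapper, submitRowBatch could submit new MdcScopedCallable<DbLoadContext>(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId), loader) instead of repeating the put/remove in every task body.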

Example 14 with RowBatch

Use of com.alibaba.otter.shared.etl.model.RowBatch in project otter by alibaba.

The class SelectTask, method processSelect.

private void processSelect() {
    while (running) {
        try {
            // Wait for ProcessTermin to exhaust pending batches; this blocks.
            // When ProcessTermin sees a rollback it signals a pause immediately, reacting faster than the distributed permit
            canStartSelector.get();
            // Check that this node is the working node: the S module must never run on two nodes at once, or the selector can produce corrupted data
            if (needCheck) {
                checkContinueWork();
            }
            // If sync was suspended, wait for the manager to finish handling it and lift the suspension
            // Also ensures we stop promptly after a rollback
            arbitrateEventService.toolEvent().waitForPermit(pipelineId);
            // startVersion solves this problem: after a rollback, decide whether fetched data predates the rollback, and discard pre-rollback data if possible.
            // (Batches taken before a rollback never actually executed; a get would return the batch after them, and without discarding, later data would execute first, then processing would return to the failure point and run it again.)
            // int startVersion = rversion.get();
            Message gotMessage = otterSelector.selector();
            // modified by ljh at 2012-09-10: startVersion must be read after the data has been fetched
            // Reading it before (a concurrency bug we hit):
            //   a. read startVersion, then fetch data; if a rollback starts and finishes during the fetch,
            //      the selector returns with the cursor already moved to the end of the queue
            //   b. the version check then detects a change and triggers another fetch, but the get cursor
            //      is already at the end of the queue, nothing comes out, and the task waits forever
            // Reading it after (a minor flaw):
            //   a. with a concurrent rollback and select, pre-rollback data can be fetched while startVersion
            //      is still uninitialized, so the rollback goes undetected and later changes sync too early
            //      (fairly likely, depending on the gap between the select and initializing startVersion)
            int startVersion = rversion.get();
            if (!canStartSelector.state()) {
                // Has an exception occurred?
                // This data was fetched at the instant of a rollback: otterSelector.selector() runs in a loop and may not have noticed the rollback yet
                rollback(gotMessage.getId());
                continue;
            }
            if (CollectionUtils.isEmpty(gotMessage.getDatas())) {
                // Even for empty data the cursor must be advanced; loopback data may have been filtered out
                // Add to the pending-response buffer; no termin signal to await, since no s/e/t/l run was started
                batchBuffer.put(new BatchTermin(gotMessage.getId(), false));
                continue;
            }
            final EtlEventData etlEventData = arbitrateEventService.selectEvent().await(pipelineId);
            if (rversion.get() != startVersion) {
                // The version changed: a rollback happened in between, so this data must be discarded
                logger.warn("rollback happened, should skip this data and get new message.");
                // Confirm the rollback has completed
                canStartSelector.get();
                // With or without data, one s/e/t/l pass must still run
                gotMessage = otterSelector.selector();
            }
            final Message message = gotMessage;
            final BatchTermin batchTermin = new BatchTermin(message.getId(), etlEventData.getProcessId());
            // Add to the pending-response buffer
            batchBuffer.put(batchTermin);
            Runnable task = new Runnable() {

                public void run() {
                    // Set up profiling information
                    boolean profiling = isProfiling();
                    Long profilingStartTime = null;
                    if (profiling) {
                        profilingStartTime = System.currentTimeMillis();
                    }
                    MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                    String currentName = Thread.currentThread().getName();
                    Thread.currentThread().setName(createTaskName(pipelineId, "SelectWorker"));
                    try {
                        pipeline = configClientService.findPipeline(pipelineId);
                        List<EventData> eventData = message.getDatas();
                        long startTime = etlEventData.getStartTime();
                        if (!CollectionUtils.isEmpty(eventData)) {
                            startTime = eventData.get(0).getExecuteTime();
                        }
                        Channel channel = configClientService.findChannelByPipelineId(pipelineId);
                        RowBatch rowBatch = new RowBatch();
                        // Build the unique identity
                        Identity identity = new Identity();
                        identity.setChannelId(channel.getId());
                        identity.setPipelineId(pipelineId);
                        identity.setProcessId(etlEventData.getProcessId());
                        rowBatch.setIdentity(identity);
                        // Merge the data
                        for (EventData data : eventData) {
                            rowBatch.merge(data);
                        }
                        long nextNodeId = etlEventData.getNextNid();
                        List<PipeKey> pipeKeys = rowDataPipeDelegate.put(new DbBatch(rowBatch), nextNodeId);
                        etlEventData.setDesc(pipeKeys);
                        etlEventData.setNumber((long) eventData.size());
                        // Use the execute time of the first original record
                        etlEventData.setFirstTime(startTime);
                        etlEventData.setBatchId(message.getId());
                        if (profiling) {
                            Long profilingEndTime = System.currentTimeMillis();
                            stageAggregationCollector.push(pipelineId, StageType.SELECT, new AggregationItem(profilingStartTime, profilingEndTime));
                        }
                        arbitrateEventService.selectEvent().single(etlEventData);
                    } catch (Throwable e) {
                        if (!isInterrupt(e)) {
                            logger.error(String.format("[%s] selectwork executor is error! data:%s", pipelineId, etlEventData), e);
                            sendRollbackTermin(pipelineId, e);
                        } else {
                            logger.info(String.format("[%s] selectwork executor is interrupted! data:%s", pipelineId, etlEventData), e);
                        }
                    } finally {
                        Thread.currentThread().setName(currentName);
                        MDC.remove(OtterConstants.splitPipelineLogFileKey);
                    }
                }
            };
            // Build a pending task so it can be cancelled when the thread pool shuts down
            SetlFuture extractFuture = new SetlFuture(StageType.SELECT, etlEventData.getProcessId(), pendingFuture, task);
            executorService.execute(extractFuture);
        } catch (Throwable e) {
            if (!isInterrupt(e)) {
                logger.error(String.format("[%s] selectTask is error!", pipelineId), e);
                sendRollbackTermin(pipelineId, e);
            } else {
                logger.info(String.format("[%s] selectTask is interrupted!", pipelineId), e);
                return;
            }
        }
    }
}
Also used : Message(com.alibaba.otter.node.etl.select.selector.Message) Channel(com.alibaba.otter.shared.common.model.config.channel.Channel) PipeKey(com.alibaba.otter.node.etl.common.pipe.PipeKey) TerminEventData(com.alibaba.otter.shared.arbitrate.model.TerminEventData) EtlEventData(com.alibaba.otter.shared.arbitrate.model.EtlEventData) EventData(com.alibaba.otter.shared.etl.model.EventData) DbBatch(com.alibaba.otter.shared.etl.model.DbBatch) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) AggregationItem(com.alibaba.otter.node.etl.common.jmx.StageAggregation.AggregationItem) Identity(com.alibaba.otter.shared.etl.model.Identity) SetlFuture(com.alibaba.otter.node.etl.extract.SetlFuture)
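
The startVersion logic is an optimistic concurrency check: rollbacks bump an AtomicInteger, the selector records the counter right after fetching, and if the counter moved across the blocking await, the fetched message is discarded and re-fetched. A stripped-down sketch of the idea, with illustrative method and parameter names that are not otter's:

import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;

// Illustration of the startVersion check in processSelect: every rollback bumps
// the version; a fetched message is trusted only if the version did not change
// across the blocking await that follows the fetch.
public class RollbackVersionCheck {

    private final AtomicInteger rversion = new AtomicInteger(0);

    // called by the rollback path
    public void onRollback() {
        rversion.incrementAndGet();
    }

    public String selectOnce(Supplier<String> selector, Runnable awaitEvent) {
        String message = selector.get();
        // read the version AFTER fetching, for the reasons in the dated comment above
        int startVersion = rversion.get();
        // blocking step, analogous to selectEvent().await(pipelineId)
        awaitEvent.run();
        if (rversion.get() != startVersion) {
            // a rollback happened while we were blocked: discard and fetch fresh data
            message = selector.get();
        }
        return message;
    }
}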

Example 15 with RowBatch

Use of com.alibaba.otter.shared.etl.model.RowBatch in project otter by alibaba.

The class OtterTransformerTest, method test_rowData_oracle_mysql.

@Test
public void test_rowData_oracle_mysql() {
    final Pipeline pipeline = new Pipeline();
    pipeline.setId(100L);
    List<DataMediaPair> pairs = new ArrayList<DataMediaPair>();
    DataMediaPair pair1 = new DataMediaPair();
    pair1.setId(1L);
    pair1.setPipelineId(pipeline.getId());
    pair1.setPullWeight(1L);
    pair1.setPushWeight(1L);
    DbDataMedia oracleMedia = getOracleMedia();
    oracleMedia.setId(1L);
    pair1.setSource(oracleMedia);
    DbDataMedia mysqlMedia = getMysqlMedia();
    pair1.setTarget(mysqlMedia);
    pairs.add(pair1);
    pipeline.setPairs(pairs);
    PipelineParameter param = new PipelineParameter();
    param.setSyncMode(SyncMode.ROW);
    pipeline.setParameters(param);
    new NonStrictExpectations() {

        {
            configClientService.findPipeline(anyLong);
            returns(pipeline);
        }
    };
    Identity identity = new Identity();
    identity.setChannelId(100L);
    identity.setPipelineId(100L);
    identity.setProcessId(100L);
    RowBatch rowBatch = new RowBatch();
    rowBatch.setIdentity(identity);
    EventData eventData = new EventData();
    eventData.setTableId(1L);
    eventData.setSchemaName("srf");
    eventData.setTableName("columns");
    eventData.setEventType(EventType.UPDATE);
    eventData.setExecuteTime(100L);
    eventData.getKeys().add(buildColumn("id", Types.NUMERIC, "1", true, false));
    eventData.getKeys().add(buildColumn("name", Types.VARCHAR, "ljh", true, false));
    eventData.getColumns().add(buildColumn("alias_name", Types.CHAR, "hello", false, false));
    eventData.getColumns().add(buildColumn("amount", Types.NUMERIC, "100.01", false, false));
    eventData.getColumns().add(buildColumn("text_b", Types.BLOB, "[116,101,120,116,95,98]", false, false));
    eventData.getColumns().add(buildColumn("text_c", Types.CLOB, "text_c", false, false));
    eventData.getColumns().add(buildColumn("curr_date", Types.DATE, "2011-01-01", false, false));
    eventData.getColumns().add(buildColumn("gmt_create", Types.DATE, "2011-01-01 11:11:11", false, false));
    eventData.getColumns().add(buildColumn("gmt_modify", Types.DATE, "2011-01-01 11:11:11", false, false));
    rowBatch.merge(eventData);
    Map<Class, BatchObject> batchs = otterTransformFactory.transform(rowBatch);
    RowBatch result = (RowBatch) batchs.get(EventData.class);
    want.number(result.getDatas().size()).isEqualTo(1);
}
Also used : DataMediaPair(com.alibaba.otter.shared.common.model.config.data.DataMediaPair) ArrayList(java.util.ArrayList) EventData(com.alibaba.otter.shared.etl.model.EventData) Pipeline(com.alibaba.otter.shared.common.model.config.pipeline.Pipeline) RowBatch(com.alibaba.otter.shared.etl.model.RowBatch) BatchObject(com.alibaba.otter.shared.etl.model.BatchObject) PipelineParameter(com.alibaba.otter.shared.common.model.config.pipeline.PipelineParameter) Identity(com.alibaba.otter.shared.etl.model.Identity) DbDataMedia(com.alibaba.otter.shared.common.model.config.data.db.DbDataMedia) Test(org.testng.annotations.Test) BaseDbTest(com.alibaba.otter.node.etl.BaseDbTest)
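
The buildColumn helper the test calls is not shown in this excerpt. A plausible reconstruction from the call sites, assuming the standard EventColumn setters from otter's model; the flag order isKey/isNull is inferred, not confirmed by this excerpt:

import com.alibaba.otter.shared.etl.model.EventColumn;

// Inferred helper: builds an EventColumn from a name, a java.sql.Types code,
// a string value, and the key/null flags used by the assertions above.
private EventColumn buildColumn(String name, int type, String value, boolean isKey, boolean isNull) {
    EventColumn column = new EventColumn();
    column.setColumnName(name);
    column.setColumnType(type);
    column.setColumnValue(value);
    column.setKey(isKey);
    column.setNull(isNull);
    return column;
}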

Aggregations

RowBatch (com.alibaba.otter.shared.etl.model.RowBatch): 23 uses
EventData (com.alibaba.otter.shared.etl.model.EventData): 14 uses
Test (org.testng.annotations.Test): 13 uses
Pipeline (com.alibaba.otter.shared.common.model.config.pipeline.Pipeline): 12 uses
DbBatch (com.alibaba.otter.shared.etl.model.DbBatch): 12 uses
Identity (com.alibaba.otter.shared.etl.model.Identity): 12 uses
DataMediaPair (com.alibaba.otter.shared.common.model.config.data.DataMediaPair): 9 uses
BaseDbTest (com.alibaba.otter.node.etl.BaseDbTest): 8 uses
FileBatch (com.alibaba.otter.shared.etl.model.FileBatch): 7 uses
BaseOtterTest (com.alibaba.otter.node.etl.BaseOtterTest): 5 uses
FileData (com.alibaba.otter.shared.etl.model.FileData): 5 uses
ArrayList (java.util.ArrayList): 5 uses
WeightController (com.alibaba.otter.node.etl.load.loader.weight.WeightController): 3 uses
Channel (com.alibaba.otter.shared.common.model.config.channel.Channel): 3 uses
Node (com.alibaba.otter.shared.common.model.config.node.Node): 3 uses
BatchObject (com.alibaba.otter.shared.etl.model.BatchObject): 3 uses
EventColumn (com.alibaba.otter.shared.etl.model.EventColumn): 3 uses
File (java.io.File): 3 uses
NodeCommmunicationClient (com.alibaba.otter.node.common.communication.NodeCommmunicationClient): 2 uses
PipeException (com.alibaba.otter.node.etl.common.pipe.exception.PipeException): 2 uses