Use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.
The class ViewExtractor, method extract.
@Override
public void extract(DbBatch dbBatch) throws ExtractException {
    Assert.notNull(dbBatch);
    Assert.notNull(dbBatch.getRowBatch());
    Pipeline pipeline = getPipeline(dbBatch.getRowBatch().getIdentity().getPipelineId());
    List<DataMediaPair> dataMediaPairs = pipeline.getPairs();
    /**
     * Key = TableId<br>
     * Value = the list of this tableId's columns that need to sync<br>
     */
    Map<Long, List<ColumnPair>> viewColumnPairs = new HashMap<Long, List<ColumnPair>>();
    Map<Long, ColumnPairMode> viewColumnPairModes = new HashMap<Long, ColumnPairMode>();
    for (DataMediaPair dataMediaPair : dataMediaPairs) {
        List<ColumnPair> columnPairs = dataMediaPair.getColumnPairs();
        // record the ColumnPairMode for this source table
        viewColumnPairModes.put(dataMediaPair.getSource().getId(), dataMediaPair.getColumnPairMode());
        // if there are no columnPairs, all columns sync by default and no filtering is needed
        if (!CollectionUtils.isEmpty(columnPairs)) {
            viewColumnPairs.put(dataMediaPair.getSource().getId(), columnPairs);
        }
    }
    List<EventData> eventDatas = dbBatch.getRowBatch().getDatas();
    // use a Set to speed up the lookups when removing
    Set<EventData> removeDatas = new HashSet<EventData>();
    for (EventData eventData : eventDatas) {
        if (eventData.getEventType().isDdl()) {
            continue;
        }
        List<ColumnPair> columns = viewColumnPairs.get(eventData.getTableId());
        if (!CollectionUtils.isEmpty(columns)) {
            // assemble the columns that need to be synced
            ColumnPairMode mode = viewColumnPairModes.get(eventData.getTableId());
            eventData.setColumns(columnFilter(eventData.getColumns(), columns, mode));
            eventData.setKeys(columnFilter(eventData.getKeys(), columns, mode));
            if (!CollectionUtils.isEmpty(eventData.getOldKeys())) {
                eventData.setOldKeys(columnFilter(eventData.getOldKeys(), columns, mode));
            }
            if (CollectionUtils.isEmpty(eventData.getKeys())) {
                // no primary key left after filtering: fail fast
                throw new ExtractException(String.format("eventData after viewExtractor has no pks , pls check! identity:%s, new eventData:%s", dbBatch.getRowBatch().getIdentity().toString(), eventData.toString()));
            }
            // for updates: if filtering left no changed columns to sync and there is no
            // primary-key change, the record can be dropped to avoid a SQL syntax error
            if (eventData.getEventType().isUpdate()
                && (CollectionUtils.isEmpty(eventData.getColumns()) || CollectionUtils.isEmpty(eventData.getUpdatedColumns()))
                && CollectionUtils.isEmpty(eventData.getOldKeys())) {
                // nothing to sync after filtering and no primary-key change: ignore this record
                removeDatas.add(eventData);
            }
        }
    }
    if (!CollectionUtils.isEmpty(removeDatas)) {
        eventDatas.removeAll(removeDatas);
    }
}
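
The columnFilter helper called above is not part of this snippet. A minimal sketch of what such a filter might look like, assuming ColumnPairMode.EXCLUDE means "drop the listed columns" while any other mode keeps only the listed ones, and assuming ColumnPair exposes its source column via getSourceColumn().getName(); both are assumptions to check against the real ViewExtractor:

private List<EventColumn> columnFilter(List<EventColumn> eventColumns, List<ColumnPair> columnPairs, ColumnPairMode mode) {
    // collect the configured source column names (assumed accessor: getSourceColumn().getName())
    Set<String> configured = new HashSet<String>();
    for (ColumnPair pair : columnPairs) {
        configured.add(StringUtils.lowerCase(pair.getSourceColumn().getName()));
    }
    List<EventColumn> result = new ArrayList<EventColumn>();
    for (EventColumn column : eventColumns) {
        boolean listed = configured.contains(StringUtils.lowerCase(column.getColumnName()));
        // EXCLUDE drops configured columns; otherwise only configured columns pass (assumption)
        if (mode == ColumnPairMode.EXCLUDE ? !listed : listed) {
            result.add(column);
        }
    }
    return result;
}
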
Use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.
The class RowDataHttpPipe, method getDbBatch.
// process the corresponding dbBatch
private DbBatch getDbBatch(HttpPipeKey key) {
    String dataUrl = key.getUrl();
    Pipeline pipeline = configClientService.findPipeline(key.getIdentity().getPipelineId());
    DataRetriever dataRetriever = dataRetrieverFactory.createRetriever(pipeline.getParameters().getRetriever(), dataUrl, downloadDir);
    File archiveFile = null;
    try {
        dataRetriever.connect();
        dataRetriever.doRetrieve();
        archiveFile = dataRetriever.getDataAsFile();
    } catch (Exception e) {
        dataRetriever.abort();
        throw new PipeException("download_error", e);
    } finally {
        dataRetriever.disconnect();
    }
    // decrypt the data if it was encrypted
    if (StringUtils.isNotEmpty(key.getKey()) && StringUtils.isNotEmpty(key.getCrc())) {
        decodeFile(archiveFile, key.getKey(), key.getCrc());
    }
    InputStream input = null;
    try {
        input = new BufferedInputStream(new FileInputStream(archiveFile));
        DbBatch dbBatch = new DbBatch();
        byte[] lengthBytes = new byte[4];
        input.read(lengthBytes);
        int length = ByteUtils.bytes2int(lengthBytes);
        BatchProto.RowBatch rowbatchProto = BatchProto.RowBatch.parseFrom(new LimitedInputStream(input, length));
        // rebuild the original model object
        RowBatch rowBatch = new RowBatch();
        rowBatch.setIdentity(build(rowbatchProto.getIdentity()));
        for (BatchProto.RowData rowDataProto : rowbatchProto.getRowsList()) {
            EventData eventData = new EventData();
            eventData.setPairId(rowDataProto.getPairId());
            eventData.setTableId(rowDataProto.getTableId());
            eventData.setTableName(rowDataProto.getTableName());
            eventData.setSchemaName(rowDataProto.getSchemaName());
            eventData.setEventType(EventType.valuesOf(rowDataProto.getEventType()));
            eventData.setExecuteTime(rowDataProto.getExecuteTime());
            // add by ljh at 2012-10-31
            if (StringUtils.isNotEmpty(rowDataProto.getSyncMode())) {
                eventData.setSyncMode(SyncMode.valuesOf(rowDataProto.getSyncMode()));
            }
            if (StringUtils.isNotEmpty(rowDataProto.getSyncConsistency())) {
                eventData.setSyncConsistency(SyncConsistency.valuesOf(rowDataProto.getSyncConsistency()));
            }
            // restore the primary keys
            List<EventColumn> keys = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getKeysList()) {
                keys.add(buildColumn(columnProto));
            }
            eventData.setKeys(keys);
            // restore the old primary keys
            if (!CollectionUtils.isEmpty(rowDataProto.getOldKeysList())) {
                List<EventColumn> oldKeys = new ArrayList<EventColumn>();
                for (BatchProto.Column columnProto : rowDataProto.getOldKeysList()) {
                    oldKeys.add(buildColumn(columnProto));
                }
                eventData.setOldKeys(oldKeys);
            }
            // restore the actual column values
            List<EventColumn> columns = new ArrayList<EventColumn>();
            for (BatchProto.Column columnProto : rowDataProto.getColumnsList()) {
                columns.add(buildColumn(columnProto));
            }
            eventData.setColumns(columns);
            eventData.setRemedy(rowDataProto.getRemedy());
            eventData.setSize(rowDataProto.getSize());
            eventData.setSql(rowDataProto.getSql());
            eventData.setDdlSchemaName(rowDataProto.getDdlSchemaName());
            eventData.setHint(rowDataProto.getHint());
            eventData.setWithoutSchema(rowDataProto.getWithoutSchema());
            // add to the overall batch
            rowBatch.merge(eventData);
        }
        dbBatch.setRowBatch(rowBatch);
        input.read(lengthBytes);
        length = ByteUtils.bytes2int(lengthBytes);
        BatchProto.FileBatch filebatchProto = BatchProto.FileBatch.parseFrom(new LimitedInputStream(input, length));
        // rebuild the original model object
        FileBatch fileBatch = new FileBatch();
        fileBatch.setIdentity(build(filebatchProto.getIdentity()));
        for (BatchProto.FileData fileDataProto : filebatchProto.getFilesList()) {
            FileData fileData = new FileData();
            fileData.setPairId(fileDataProto.getPairId());
            fileData.setTableId(fileDataProto.getTableId());
            fileData.setEventType(EventType.valuesOf(fileDataProto.getEventType()));
            fileData.setLastModifiedTime(fileDataProto.getLastModifiedTime());
            fileData.setNameSpace(fileDataProto.getNamespace());
            fileData.setPath(fileDataProto.getPath());
            fileData.setSize(fileDataProto.getSize());
            // add to the fileBatch
            fileBatch.getFiles().add(fileData);
        }
        dbBatch.setFileBatch(fileBatch);
        return dbBatch;
    } catch (IOException e) {
        throw new PipeException("deserial_error", e);
    } finally {
        IOUtils.closeQuietly(input);
    }
}
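
Two details above are worth isolating: each protobuf block is prefixed with a 4-byte length, and only that many bytes are exposed to parseFrom through LimitedInputStream. A self-contained sketch (not otter code) of the same framing; the little-endian decoding is an assumption about what ByteUtils.bytes2int does, and readFully avoids the short-read risk of a bare input.read(lengthBytes):

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

public final class FramedReader {

    // reads one length-prefixed frame: a 4-byte length header, then exactly that many body bytes
    public static byte[] readFrame(InputStream input) throws IOException {
        DataInputStream din = new DataInputStream(input);
        byte[] lengthBytes = new byte[4];
        din.readFully(lengthBytes); // unlike read(), readFully never returns a partial header
        int length = (lengthBytes[0] & 0xff)
                     | ((lengthBytes[1] & 0xff) << 8)
                     | ((lengthBytes[2] & 0xff) << 16)
                     | ((lengthBytes[3] & 0xff) << 24); // assumed little-endian, mirroring bytes2int
        byte[] body = new byte[length];
        din.readFully(body); // hand back exactly one frame for parseFrom
        return body;
    }
}
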
Use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.
The class DbLoadAction, method doLoad.
private void doLoad(final DbLoadContext context, DbLoadData loadData) {
    // process deletes first so they can benefit from batch optimization
    List<List<EventData>> batchDatas = new ArrayList<List<EventData>>();
    for (TableLoadData tableData : loadData.getTables()) {
        if (useBatch) {
            // run delete statements first; for unique-key updates the usual pattern is
            // delete + insert, which avoids concurrent-update conflicts
            batchDatas.addAll(split(tableData.getDeleteDatas()));
        } else {
            // same ordering without batching: one delete per group
            for (EventData data : tableData.getDeleteDatas()) {
                batchDatas.add(Arrays.asList(data));
            }
        }
    }
    if (context.getPipeline().getParameters().isDryRun()) {
        doDryRun(context, batchDatas, true);
    } else {
        doTwoPhase(context, batchDatas, true);
    }
    batchDatas.clear();
    // now process the inserts/updates
    for (TableLoadData tableData : loadData.getTables()) {
        if (useBatch) {
            // run the insert + update statements
            batchDatas.addAll(split(tableData.getInsertDatas()));
            // each record forms its own group, loaded in parallel
            batchDatas.addAll(split(tableData.getUpadateDatas()));
        } else {
            // run the insert + update statements one record per group
            for (EventData data : tableData.getInsertDatas()) {
                batchDatas.add(Arrays.asList(data));
            }
            for (EventData data : tableData.getUpadateDatas()) {
                batchDatas.add(Arrays.asList(data));
            }
        }
    }
    if (context.getPipeline().getParameters().isDryRun()) {
        doDryRun(context, batchDatas, true);
    } else {
        doTwoPhase(context, batchDatas, true);
    }
    batchDatas.clear();
}
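
The split helper is not included in this snippet; in otter it also arranges the groups for parallel loading. As a minimal sketch, assuming a batchSize config field (an assumption, not the verbatim implementation), it chunks one table's rows into sublists that each become a single JDBC batch:

private List<List<EventData>> split(List<EventData> datas) {
    List<List<EventData>> result = new ArrayList<List<EventData>>();
    if (CollectionUtils.isEmpty(datas)) {
        return result;
    }
    for (int i = 0; i < datas.size(); i += batchSize) { // batchSize is an assumed field
        int end = Math.min(i + batchSize, datas.size());
        result.add(new ArrayList<EventData>(datas.subList(i, end)));
    }
    return result;
}
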
Use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.
The class DbLoadAction, method doTwoPhase.
/**
 * Execute in parallel first; on failure, fall back to serial execution.
 */
private void doTwoPhase(DbLoadContext context, List<List<EventData>> totalRows, boolean canBatch) {
    // pre-process the data
    List<Future<Exception>> results = new ArrayList<Future<Exception>>();
    for (List<EventData> rows : totalRows) {
        if (CollectionUtils.isEmpty(rows)) {
            // skip empty groups
            continue;
        }
        results.add(executor.submit(new DbLoadWorker(context, rows, canBatch)));
    }
    boolean partFailed = false;
    for (int i = 0; i < results.size(); i++) {
        Future<Exception> result = results.get(i);
        Exception ex = null;
        try {
            ex = result.get();
            for (EventData data : totalRows.get(i)) {
                // notify that loading finished
                interceptor.after(context, data);
            }
        } catch (Exception e) {
            ex = e;
        }
        if (ex != null) {
            logger.warn("##load phase one failed!", ex);
            partFailed = true;
        }
    }
    if (partFailed) {
        // if (CollectionUtils.isEmpty(context.getFailedDatas())) {
        // logger.error("##load phase one failed but failedDatas is empty!");
        // return;
        // }
        // retry with everything phase one ran, so a miscalculated failed-data set cannot lose records
        List<EventData> retryEventDatas = new ArrayList<EventData>();
        for (List<EventData> rows : totalRows) {
            retryEventDatas.addAll(rows);
        }
        // clear the failed-data records
        context.getFailedDatas().clear();
        // may be null: older manager versions serialized the config before the database
        // had a skipLoadException setting
        Boolean skipLoadException = context.getPipeline().getParameters().getSkipLoadException();
        if (skipLoadException != null && skipLoadException) {
            // if skipping single-record exceptions is allowed, load record by record,
            // precisely filter out the failing ones, and log them
            for (EventData retryEventData : retryEventDatas) {
                // force batch off
                DbLoadWorker worker = new DbLoadWorker(context, Arrays.asList(retryEventData), false);
                try {
                    Exception ex = worker.call();
                    if (ex != null) {
                        // do skip
                        logger.warn("skip exception for data : {} , caused by {}", retryEventData, ExceptionUtils.getFullStackTrace(ex));
                    }
                } catch (Exception ex) {
                    // do skip
                    logger.warn("skip exception for data : {} , caused by {}", retryEventData, ExceptionUtils.getFullStackTrace(ex));
                }
            }
        } else {
            // process the whole batch in one go to cut down on thread scheduling
            // force batch off
            DbLoadWorker worker = new DbLoadWorker(context, retryEventDatas, false);
            try {
                Exception ex = worker.call();
                if (ex != null) {
                    // rethrow locally so the failure funnels into LoadException below
                    throw ex;
                }
            } catch (Exception ex) {
                logger.error("##load phase two failed!", ex);
                throw new LoadException(ex);
            }
        }
        // notify that loading finished for the retried records
        for (EventData data : retryEventDatas) {
            interceptor.after(context, data);
        }
    }
}
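
DbLoadWorker itself is not shown here. A hypothetical skeleton consistent with how doTwoPhase uses it: a Callable<Exception> that returns failures instead of throwing them, records failed rows on the context, and honors the canBatch flag. The executeStatements method and the context accessors are placeholders/assumptions, not otter's verbatim code:

class DbLoadWorker implements Callable<Exception> {

    private final DbLoadContext   context;
    private final List<EventData> datas;
    private final boolean         canBatch;

    DbLoadWorker(DbLoadContext context, List<EventData> datas, boolean canBatch) {
        this.context = context;
        this.datas = datas;
        this.canBatch = canBatch;
    }

    public Exception call() {
        try {
            // placeholder for the real JDBC insert/update/delete work, batched when canBatch is true
            executeStatements(datas, canBatch);
            return null; // success is signalled by returning null, not by the absence of a throw
        } catch (Exception e) {
            context.getFailedDatas().addAll(datas); // let doTwoPhase see what failed
            return e; // hand the failure back instead of throwing
        }
    }

    private void executeStatements(List<EventData> rows, boolean batch) {
        // hypothetical: build and execute SQL per EventData
    }
}

Returning the exception keeps Future.get() free of expected data errors, so doTwoPhase can tell a failed row group apart from an executor-level problem.
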
Use of com.alibaba.otter.shared.etl.model.EventData in project otter by alibaba.
The class SelectTask, method processSelect.
private void processSelect() {
    while (running) {
        try {
            // wait for ProcessTermin to exhaust; this call blocks.
            // when ProcessTermin sees a rollback it signals a pause immediately,
            // which is more timely than the distributed permit
            canStartSelector.get();
            // check that this is the working node: the S stage must never run on two
            // nodes at once, or the selector easily produces disordered data
            if (needCheck) {
                checkContinueWork();
            }
            // if the pipeline was suspended, wait until the manager finishes and lifts
            // the suspension before sync resumes; this also stops promptly after a rollback
            arbitrateEventService.toolEvent().waitForPermit(pipelineId);
            // startVersion solves this problem: when a rollback occurs, decide as well as
            // possible whether fetched data predates or postdates the rollback, and discard
            // pre-rollback data. (after a rollback, the previously fetched batches never
            // actually executed; a get would return the batch after them, and without
            // discarding, later data would run first, then the cursor returns to the
            // failure point and runs again)
            // int startVersion = rversion.get();
            Message gotMessage = otterSelector.selector();
            // modify by ljh at 2012-09-10: the startVersion read must happen after the data is fetched
            // reading it before (hit a concurrency bug):
            //   a. read startVersion first, then fetch; if a rollback starts and completes during
            //      the fetch, the selector returns with the data already consumed to the end
            //   b. the version check then sees a change and triggers another fetch, but the get
            //      cursor is already at the end of the queue, nothing comes out, and the loop
            //      waits forever
            // reading it after (a minor flaw):
            //   a. with rollback and select running concurrently, data fetched before the rollback
            //      slips through because startVersion was not yet initialized, so the rollback goes
            //      undetected and later changes sync ahead of time (fairly likely, depending on the
            //      gap between the select and the startVersion initialization)
            int startVersion = rversion.get();
            if (!canStartSelector.state()) {
                // an exception may have occurred: otterSelector.selector() loops, so at the
                // instant of a rollback it may have fetched data without noticing it yet
                rollback(gotMessage.getId());
                continue;
            }
            if (CollectionUtils.isEmpty(gotMessage.getDatas())) {
                // handle empty data: the cursor still has to advance, e.g. when loopback data
                // was filtered out. add it to the pending-response buffer; no termin signal
                // needs to be awaited because no s/e/t/l run was started
                batchBuffer.put(new BatchTermin(gotMessage.getId(), false));
                continue;
            }
            final EtlEventData etlEventData = arbitrateEventService.selectEvent().await(pipelineId);
            if (rversion.get() != startVersion) {
                // the version changed, so a rollback happened in between and this data must be discarded
                logger.warn("rollback happened , should skip this data and get new message.");
                // make sure the rollback has completed
                canStartSelector.get();
                // data or not, one s/e/t/l pass must run now
                gotMessage = otterSelector.selector();
            }
            final Message message = gotMessage;
            final BatchTermin batchTermin = new BatchTermin(message.getId(), etlEventData.getProcessId());
            // add to the pending-response buffer
            batchBuffer.put(batchTermin);
            Runnable task = new Runnable() {
                public void run() {
                    // collect profiling information
                    boolean profiling = isProfiling();
                    Long profilingStartTime = null;
                    if (profiling) {
                        profilingStartTime = System.currentTimeMillis();
                    }
                    MDC.put(OtterConstants.splitPipelineLogFileKey, String.valueOf(pipelineId));
                    String currentName = Thread.currentThread().getName();
                    Thread.currentThread().setName(createTaskName(pipelineId, "SelectWorker"));
                    try {
                        pipeline = configClientService.findPipeline(pipelineId);
                        List<EventData> eventData = message.getDatas();
                        long startTime = etlEventData.getStartTime();
                        if (!CollectionUtils.isEmpty(eventData)) {
                            startTime = eventData.get(0).getExecuteTime();
                        }
                        Channel channel = configClientService.findChannelByPipelineId(pipelineId);
                        RowBatch rowBatch = new RowBatch();
                        // build the unique identity
                        Identity identity = new Identity();
                        identity.setChannelId(channel.getId());
                        identity.setPipelineId(pipelineId);
                        identity.setProcessId(etlEventData.getProcessId());
                        rowBatch.setIdentity(identity);
                        // merge the data
                        for (EventData data : eventData) {
                            rowBatch.merge(data);
                        }
                        long nextNodeId = etlEventData.getNextNid();
                        List<PipeKey> pipeKeys = rowDataPipeDelegate.put(new DbBatch(rowBatch), nextNodeId);
                        etlEventData.setDesc(pipeKeys);
                        etlEventData.setNumber((long) eventData.size());
                        // use the execute time of the first original record
                        etlEventData.setFirstTime(startTime);
                        etlEventData.setBatchId(message.getId());
                        if (profiling) {
                            Long profilingEndTime = System.currentTimeMillis();
                            stageAggregationCollector.push(pipelineId, StageType.SELECT, new AggregationItem(profilingStartTime, profilingEndTime));
                        }
                        arbitrateEventService.selectEvent().single(etlEventData);
                    } catch (Throwable e) {
                        if (!isInterrupt(e)) {
                            logger.error(String.format("[%s] selectwork executor is error! data:%s", pipelineId, etlEventData), e);
                            sendRollbackTermin(pipelineId, e);
                        } else {
                            logger.info(String.format("[%s] selectwork executor is interrupt! data:%s", pipelineId, etlEventData), e);
                        }
                    } finally {
                        Thread.currentThread().setName(currentName);
                        MDC.remove(OtterConstants.splitPipelineLogFileKey);
                    }
                }
            };
            // wrap as a pending task so it can be cancelled when the thread pool shuts down
            SetlFuture extractFuture = new SetlFuture(StageType.SELECT, etlEventData.getProcessId(), pendingFuture, task);
            executorService.execute(extractFuture);
        } catch (Throwable e) {
            if (!isInterrupt(e)) {
                logger.error(String.format("[%s] selectTask is error!", pipelineId), e);
                sendRollbackTermin(pipelineId, e);
            } else {
                logger.info(String.format("[%s] selectTask is interrupt!", pipelineId), e);
                return;
            }
        }
    }
}
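
The startVersion handling above is subtle enough to illustrate in isolation. A minimal, self-contained sketch (not otter code) of the same pattern: a shared counter is bumped on every rollback, so a fetching thread that snapshots the counter after its fetch can detect a rollback that raced it and refetch:

import java.util.concurrent.atomic.AtomicInteger;

public class RollbackVersionSketch {

    private final AtomicInteger rversion = new AtomicInteger(0);

    // rollback path: bump the version so in-flight fetches can notice
    public void onRollback() {
        rversion.incrementAndGet();
    }

    // select path, mirroring processSelect: fetch first, then snapshot the version
    public String selectConsistently() throws InterruptedException {
        String batch = fetchBatch();
        int startVersion = rversion.get(); // snapshot after fetching (see the 2012-09-10 note above)
        awaitEtlSlot();                    // blocking work during which a rollback may occur
        if (rversion.get() != startVersion) {
            batch = fetchBatch();          // a rollback raced us: discard and refetch
        }
        return batch;
    }

    // stand-ins for otterSelector.selector() and the arbitrate await
    private String fetchBatch() { return "batch"; }
    private void awaitEtlSlot() throws InterruptedException { Thread.sleep(10); }
}
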