Search in sources :

Example 36 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class UnstructuredStorageReaderUtil method validateColumn.

/**
 * Validates the {@code column} section of an unstructured-storage reader configuration.
 *
 * <p>A column list is mandatory. A single-element list whose only entry renders as
 * {@code "*"} or {@code '*'} is treated as a wildcard: the column key is reset to
 * {@code null} on {@code readerConfiguration} and no per-column validation happens.
 * Otherwise every column must declare a type and exactly one of index / value, and a
 * configured index must not be negative.
 *
 * @param readerConfiguration reader configuration to validate; may be modified (wildcard case)
 * @throws DataXException if the column list is missing or empty, or a column entry is invalid
 */
public static void validateColumn(Configuration readerConfiguration) {
    List<Configuration> columnConfs = readerConfiguration.getListConfiguration(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    if (columnConfs == null || columnConfs.isEmpty()) {
        throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.REQUIRED_VALUE, "您需要指定 columns");
    }

    // Wildcard form ["*"]: clear the column setting and skip per-column checks.
    if (columnConfs.size() == 1) {
        String soleEntry = columnConfs.get(0).toString();
        boolean isWildcard = "\"*\"".equals(soleEntry) || "'*'".equals(soleEntry);
        if (isWildcard) {
            readerConfiguration.set(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN, null);
            return;
        }
    }

    for (Configuration columnConf : columnConfs) {
        // The type is always required.
        columnConf.getNecessaryValue(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.TYPE, UnstructuredStorageReaderErrorCode.REQUIRED_VALUE);

        Integer index = columnConf.getInt(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.INDEX);
        String constantValue = columnConf.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.VALUE);
        boolean hasIndex = index != null;
        boolean hasValue = constantValue != null;

        if (!hasIndex && !hasValue) {
            throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.NO_INDEX_VALUE, "由于您配置了type, 则至少需要配置 index 或 value");
        }
        if (hasIndex && hasValue) {
            throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.MIXED_INDEX_VALUE, "您混合配置了index, value, 每一列同时仅能选择其中一种");
        }
        if (hasIndex && index < 0) {
            throw DataXException.asDataXException(UnstructuredStorageReaderErrorCode.ILLEGAL_VALUE, String.format("index需要大于等于0, 您配置的index为[%s]", index));
        }
    }
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration)

Example 37 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class OriginalConfPretreatmentUtil method dealJdbcAndTable.

/**
 * Resolves a usable JDBC URL for every configured connection and, in table mode,
 * expands each connection's table list.
 *
 * <p>The chosen URL and the expanded tables are written back into {@code originalConfig}
 * under the connection's indexed path, and the total number of expanded tables is stored
 * under {@code Constant.TABLE_NUMBER_MARK} (0 when not in table mode).
 *
 * @param originalConfig job configuration; updated in place
 * @throws DataXException if, in table mode, a connection's tables expand to nothing
 */
private static void dealJdbcAndTable(Configuration originalConfig) {
    String username = originalConfig.getString(Key.USERNAME);
    String password = originalConfig.getString(Key.PASSWORD);
    boolean checkSlave = originalConfig.getBool(Key.CHECK_SLAVE, false);
    boolean isTableMode = originalConfig.getBool(Constant.IS_TABLE_MODE);
    boolean isPreCheck = originalConfig.getBool(Key.DRYRUN, false);
    List<Object> connections = originalConfig.getList(Constant.CONN_MARK, Object.class);
    List<String> preSql = originalConfig.getList(Key.PRE_SQL, String.class);

    int totalTables = 0;
    final int connectionCount = connections.size();
    for (int idx = 0; idx < connectionCount; idx++) {
        Configuration connConf = Configuration.from(connections.get(idx).toString());
        connConf.getNecessaryValue(Key.JDBC_URL, DBUtilErrorCode.REQUIRED_VALUE);
        List<String> candidateUrls = connConf.getList(Key.JDBC_URL, String.class);

        // A dry run (pre-check) uses the non-retrying URL chooser.
        String chosenUrl = isPreCheck
                ? DBUtil.chooseJdbcUrlWithoutRetry(DATABASE_TYPE, candidateUrls, username, password, preSql, checkSlave)
                : DBUtil.chooseJdbcUrl(DATABASE_TYPE, candidateUrls, username, password, preSql, checkSlave);
        chosenUrl = DATABASE_TYPE.appendJDBCSuffixForReader(chosenUrl);

        // Write the resolved URL back to connection[idx].jdbcUrl.
        originalConfig.set(String.format("%s[%d].%s", Constant.CONN_MARK, idx, Key.JDBC_URL), chosenUrl);
        LOG.info("Available jdbcUrl:{}.", chosenUrl);

        if (!isTableMode) {
            // querySql mode: nothing more to do for this connection.
            continue;
        }

        // Table mode: expand the table entries configured on this connection.
        List<String> configuredTables = connConf.getList(Key.TABLE, String.class);
        List<String> expanded = TableExpandUtil.expandTableConf(DATABASE_TYPE, configuredTables);
        if (expanded == null || expanded.isEmpty()) {
            throw DataXException.asDataXException(DBUtilErrorCode.ILLEGAL_VALUE, String.format("您所配置的读取数据库表:%s 不正确. 因为DataX根据您的配置找不到这张表. 请检查您的配置并作出修改." + "请先了解 DataX 配置.", StringUtils.join(configuredTables, ",")));
        }
        totalTables += expanded.size();
        originalConfig.set(String.format("%s[%d].%s", Constant.CONN_MARK, idx, Key.TABLE), expanded);
    }
    originalConfig.set(Constant.TABLE_NUMBER_MARK, totalTables);
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration)

Example 38 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class HdfsHelper method getColumnTypeInspectors.

/**
 * Builds one Hive {@link ObjectInspector} per configured writer column, in column order.
 *
 * <p>Each column's configured type name is upper-cased and resolved to a
 * {@code SupportHiveDataType}, which is then mapped to the Java class used to obtain a
 * reflection-based inspector.
 *
 * @param columns column configurations; each is read for its {@code Key.TYPE} entry
 * @return the inspectors, one per entry of {@code columns}
 * @throws DataXException if a resolved type has no Java mapping in this method
 */
public List<ObjectInspector> getColumnTypeInspectors(List<Configuration> columns) {
    List<ObjectInspector> columnTypeInspectors = Lists.newArrayList();
    for (Configuration eachColumnConf : columns) {
        SupportHiveDataType columnType = SupportHiveDataType.valueOf(eachColumnConf.getString(Key.TYPE).toUpperCase());
        // Pick the Java class for this column type; the inspector is created once below.
        Class<?> javaType;
        switch(columnType) {
            case TINYINT:
                javaType = Byte.class;
                break;
            case SMALLINT:
                javaType = Short.class;
                break;
            case INT:
                javaType = Integer.class;
                break;
            case BIGINT:
                javaType = Long.class;
                break;
            case FLOAT:
                javaType = Float.class;
                break;
            case DOUBLE:
                javaType = Double.class;
                break;
            case TIMESTAMP:
                javaType = java.sql.Timestamp.class;
                break;
            case DATE:
                javaType = java.sql.Date.class;
                break;
            case STRING:
            case VARCHAR:
            case CHAR:
                javaType = String.class;
                break;
            case BOOLEAN:
                javaType = Boolean.class;
                break;
            default:
                // Both arguments are strings, so both placeholders must be %s. The previous
                // %d made String.format throw IllegalFormatConversionException, which hid
                // this configuration error from the user.
                throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, String.format("您的配置文件中的列配置信息有误. 因为DataX 不支持数据库写入这种字段类型. 字段名:[%s], 字段类型:[%s]. 请修改表中该字段的类型或者不同步该字段.", eachColumnConf.getString(Key.NAME), eachColumnConf.getString(Key.TYPE)));
        }
        columnTypeInspectors.add(ObjectInspectorFactory.getReflectionObjectInspector(javaType, ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
    }
    return columnTypeInspectors;
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(com.alibaba.datax.common.util.Configuration)

Example 39 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class ReaderSplitUtil method doSplit.

/**
 * Splits a reader job configuration into per-task slice configurations.
 *
 * <p>For every configured connection a slice is derived from a clone of
 * {@code originalSliceConfig} carrying that connection's JDBC URL and a load-balance
 * mark parsed from it, with the connection list removed. In table mode each table yields
 * either several slices (when a split key is configured and more than one split per table
 * is advised) or a single slice with a generated query; in querySql mode each configured
 * SQL statement yields one slice.
 *
 * @param originalSliceConfig job configuration to split; not modified (clones are used)
 * @param adviceNumber        advised number of slices (the channel count, i.e. the number
 *                            of concurrent tasks)
 * @return the slice configurations, in connection order
 */
public static List<Configuration> doSplit(Configuration originalSliceConfig, int adviceNumber) {
    boolean isTableMode = originalSliceConfig.getBool(Constant.IS_TABLE_MODE).booleanValue();
    int eachTableShouldSplittedNumber = -1;
    if (isTableMode) {
        // adviceNumber is the channel count. The per-table split count is rounded up, so it
        // is not necessarily proportional to adviceNumber any more.
        eachTableShouldSplittedNumber = calculateEachTableShouldSplittedNumber(adviceNumber, originalSliceConfig.getInt(Constant.TABLE_NUMBER_MARK));
    }
    String column = originalSliceConfig.getString(Key.COLUMN);
    String where = originalSliceConfig.getString(Key.WHERE, null);
    List<Object> conns = originalSliceConfig.getList(Constant.CONN_MARK, Object.class);
    List<Configuration> splittedConfigs = new ArrayList<Configuration>();
    for (int i = 0, len = conns.size(); i < len; i++) {
        Configuration sliceConfig = originalSliceConfig.clone();
        Configuration connConf = Configuration.from(conns.get(i).toString());
        String jdbcUrl = connConf.getString(Key.JDBC_URL);
        sliceConfig.set(Key.JDBC_URL, jdbcUrl);
        // Tag the slice with the ip/port taken from the jdbcUrl so that core can do a
        // meaningful shuffle across resources.
        sliceConfig.set(CommonConstant.LOAD_BALANCE_RESOURCE_MARK, DataBaseType.parseIpFromJdbcUrl(jdbcUrl));
        sliceConfig.remove(Constant.CONN_MARK);
        Configuration tempSlice;
        if (isTableMode) {
            // Table mode: tables were already expanded and quoted earlier, use them directly.
            List<String> tables = connConf.getList(Key.TABLE, String.class);
            Validate.isTrue(null != tables && !tables.isEmpty(), "您读取数据库表配置错误.");
            String splitPk = originalSliceConfig.getString(Key.SPLIT_PK, null);
            // The final number of slices is not necessarily eachTableShouldSplittedNumber.
            boolean needSplitTable = eachTableShouldSplittedNumber > 1 && StringUtils.isNotBlank(splitPk);
            if (needSplitTable) {
                // Keep the single-table boost local to this connection. Multiplying the
                // shared counter in place compounded the factor across connections
                // (x5, x25, ...) and leaked it into later multi-table connections.
                int splitNumberForConnection = eachTableShouldSplittedNumber;
                if (tables.size() == 1) {
                    // A single table is split five times finer. An earlier "num * 2 + 1"
                    // rule was dropped because the extra slice caused a long tail; the
                    // "splitPk IS NULL" slice is small and is ignored for this ratio.
                    splitNumberForConnection = eachTableShouldSplittedNumber * 5;
                }
                // Try to split every table into splitNumberForConnection pieces.
                for (String table : tables) {
                    tempSlice = sliceConfig.clone();
                    tempSlice.set(Key.TABLE, table);
                    List<Configuration> splittedSlices = SingleTableSplitUtil.splitSingleTable(tempSlice, splitNumberForConnection);
                    splittedConfigs.addAll(splittedSlices);
                }
            } else {
                // No splitting: one slice per table with a full-table query.
                for (String table : tables) {
                    tempSlice = sliceConfig.clone();
                    tempSlice.set(Key.TABLE, table);
                    String queryColumn = HintUtil.buildQueryColumn(jdbcUrl, table, column);
                    tempSlice.set(Key.QUERY_SQL, SingleTableSplitUtil.buildQuerySql(queryColumn, table, where));
                    splittedConfigs.add(tempSlice);
                }
            }
        } else {
            // querySql mode: one slice per configured statement.
            List<String> sqls = connConf.getList(Key.QUERY_SQL, String.class);
            // TODO: should a configuration with multiple statements be checked?
            for (String querySql : sqls) {
                tempSlice = sliceConfig.clone();
                tempSlice.set(Key.QUERY_SQL, querySql);
                splittedConfigs.add(tempSlice);
            }
        }
    }
    return splittedConfigs;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) ArrayList(java.util.ArrayList)

Example 40 with Configuration

use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

the class SingleTableSplitUtil method splitSingleTable.

/**
 * Splits the read of a single table into several slice configurations along the
 * configured split key.
 *
 * <p>For Oracle the range conditions come from {@code genSplitSqlForOracle}; for other
 * databases the key's min/max is fetched and divided into ranges for string or integer
 * keys. Each range becomes a clone of {@code configuration} whose query is the base query
 * plus that range; if no ranges were produced a single "key IS NOT NULL" slice is used
 * instead. A final "key IS NULL" slice is always appended. If the min or max of the key is
 * {@code null}, {@code configuration} itself (with the unsplit query set) is the only result.
 *
 * @param configuration slice configuration for one table; its query SQL entry is set in
 *                      place on the non-Oracle path
 * @param adviceNum     advised number of ranges
 * @return the resulting slice configurations
 * @throws DataXException if the key range cannot be determined or the key type is neither
 *                        string nor integer
 */
public static List<Configuration> splitSingleTable(Configuration configuration, int adviceNum) {
    List<Configuration> slices = new ArrayList<Configuration>();
    String splitPkName = configuration.getString(Key.SPLIT_PK);
    String column = configuration.getString(Key.COLUMN);
    String table = configuration.getString(Key.TABLE);
    String where = configuration.getString(Key.WHERE, null);
    // The extra condition is joined with "and" when a where clause exists, else "where".
    String connector = StringUtils.isNotBlank(where) ? " and " : " where ";

    List<String> ranges;
    if (DATABASE_TYPE == DataBaseType.Oracle) {
        // Note from the original: other databases (mysql etc.) are to be added here.
        ranges = genSplitSqlForOracle(splitPkName, table, where, configuration, adviceNum);
    } else {
        Pair<Object, Object> pkBounds = getPkRange(configuration);
        if (pkBounds == null) {
            throw DataXException.asDataXException(DBUtilErrorCode.ILLEGAL_SPLIT_PK, "根据切分主键切分表失败. DataX 仅支持切分主键为一个,并且类型为整数或者字符串类型. 请尝试使用其他的切分主键或者联系 DBA 进行处理.");
        }
        configuration.set(Key.QUERY_SQL, buildQuerySql(column, table, where));

        Object lower = pkBounds.getLeft();
        Object upper = pkBounds.getRight();
        if (lower == null || upper == null) {
            // Either bound came back null: return the unsplit configuration as-is.
            slices.add(configuration);
            return slices;
        }

        String pkType = configuration.getString(Constant.PK_TYPE);
        if (Constant.PK_TYPE_STRING.equals(pkType)) {
            ranges = RdbmsRangeSplitWrap.splitAndWrap(String.valueOf(lower), String.valueOf(upper), adviceNum, splitPkName, "'", DATABASE_TYPE);
        } else if (Constant.PK_TYPE_LONG.equals(pkType)) {
            ranges = RdbmsRangeSplitWrap.splitAndWrap(new BigInteger(lower.toString()), new BigInteger(upper.toString()), adviceNum, splitPkName);
        } else {
            throw DataXException.asDataXException(DBUtilErrorCode.ILLEGAL_SPLIT_PK, "您配置的切分主键(splitPk) 类型 DataX 不支持. DataX 仅支持切分主键为一个,并且类型为整数或者字符串类型. 请尝试使用其他的切分主键或者联系 DBA 进行处理.");
        }
    }

    List<String> generatedSql = new ArrayList<String>();
    if (ranges == null || ranges.isEmpty()) {
        // No ranges: a single slice covering every row with a non-null key. Adding the
        // unsplit configuration here would be wrong (per the original note).
        Configuration notNullSlice = configuration.clone();
        String notNullSql = buildQuerySql(column, table, where) + connector + String.format(" %s IS NOT NULL", splitPkName);
        generatedSql.add(notNullSql);
        notNullSlice.set(Key.QUERY_SQL, notNullSql);
        slices.add(notNullSlice);
    } else {
        for (String range : ranges) {
            Configuration rangeSlice = configuration.clone();
            String rangeSql = buildQuerySql(column, table, where) + connector + range;
            generatedSql.add(rangeSql);
            rangeSlice.set(Key.QUERY_SQL, rangeSql);
            slices.add(rangeSlice);
        }
    }

    // Always add one more slice for rows whose split key is NULL.
    Configuration nullPkSlice = configuration.clone();
    String nullPkSql = buildQuerySql(column, table, where) + connector + String.format(" %s IS NULL", splitPkName);
    generatedSql.add(nullPkSql);
    LOG.info("After split(), allQuerySql=[\n{}\n].", StringUtils.join(generatedSql, "\n"));
    nullPkSlice.set(Key.QUERY_SQL, nullPkSql);
    slices.add(nullPkSlice);
    return slices;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) ArrayList(java.util.ArrayList) BigInteger(java.math.BigInteger)

Aggregations

Configuration (com.alibaba.datax.common.util.Configuration)82 ArrayList (java.util.ArrayList)27 Test (org.junit.Test)19 Communication (com.alibaba.datax.core.statistics.communication.Communication)13 DataXException (com.alibaba.datax.common.exception.DataXException)9 Method (java.lang.reflect.Method)8 Record (com.alibaba.datax.common.element.Record)7 JobContainer (com.alibaba.datax.core.job.JobContainer)6 IOException (java.io.IOException)5 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)5 LongColumn (com.alibaba.datax.common.element.LongColumn)4 TaskPluginCollector (com.alibaba.datax.common.plugin.TaskPluginCollector)4 TaskGroupContainer (com.alibaba.datax.core.taskgroup.TaskGroupContainer)4 Channel (com.alibaba.datax.core.transport.channel.Channel)4 MemoryChannel (com.alibaba.datax.core.transport.channel.memory.MemoryChannel)4 DefaultRecord (com.alibaba.datax.core.transport.record.DefaultRecord)4 File (java.io.File)4 HashSet (java.util.HashSet)3 List (java.util.List)3 VMInfo (com.alibaba.datax.common.statistics.VMInfo)2