
Example 51 with Configuration

Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

Class NormalTask, method getRowkey.

public byte[] getRowkey(Record record) {
    byte[] rowkeyBuffer = {};
    for (Configuration aRowkeyColumn : rowkeyColumn) {
        Integer index = aRowkeyColumn.getInt(Key.INDEX);
        String type = aRowkeyColumn.getString(Key.TYPE);
        ColumnType columnType = ColumnType.getByTypeName(type);
        if (index == -1) {
            String value = aRowkeyColumn.getString(Key.VALUE);
            rowkeyBuffer = Bytes.add(rowkeyBuffer, getValueByte(columnType, value));
        } else {
            if (index >= record.getColumnNumber()) {
                throw DataXException.asDataXException(Hbase11xWriterErrorCode.CONSTRUCT_ROWKEY_ERROR, String.format("The index in your rowkeyColumn configuration is out of range: according to the reader configuration, index must be less than %s, but you configured %s. Please check and fix it.", record.getColumnNumber(), index));
            }
            byte[] value = getColumnByte(columnType, record.getColumn(index));
            rowkeyBuffer = Bytes.add(rowkeyBuffer, value);
        }
    }
    return rowkeyBuffer;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration)
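
For context, a minimal, hedged sketch (not part of the DataX source) of the rowkeyColumn configuration that getRowkey iterates over. The literal keys index, type and value mirror the Key constants read above, and Configuration.from / getListConfiguration are the same helpers used throughout these examples; the JSON values are made-up sample data.

import com.alibaba.datax.common.util.Configuration;
import java.util.List;

public class RowkeyColumnConfigSketch {
    public static void main(String[] args) {
        // Two rowkey parts: column 0 of the record, then a constant "_" separator (index == -1).
        Configuration conf = Configuration.from(
            "{\"rowkeyColumn\": ["
          + "  {\"index\": 0,  \"type\": \"string\"},"
          + "  {\"index\": -1, \"type\": \"string\", \"value\": \"_\"}"
          + "]}");
        List<Configuration> rowkeyColumn = conf.getListConfiguration("rowkeyColumn");
        for (Configuration aRowkeyColumn : rowkeyColumn) {
            System.out.println(aRowkeyColumn.getInt("index") + " -> " + aRowkeyColumn.getString("type"));
        }
    }
}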

Example 52 with Configuration

Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

Class HbaseSplitUtil, method doSplit.

private static List<Configuration> doSplit(Configuration config, byte[] startRowkeyByte, byte[] endRowkeyByte, Pair<byte[][], byte[][]> regionRanges) {
    List<Configuration> configurations = new ArrayList<Configuration>();
    for (int i = 0; i < regionRanges.getFirst().length; i++) {
        byte[] regionStartKey = regionRanges.getFirst()[i];
        byte[] regionEndKey = regionRanges.getSecond()[i];
        // Note: if the user sets userEndKey to "", this check should not trigger; an empty userEndKey means read up to the last region.
        if (Bytes.compareTo(regionEndKey, HConstants.EMPTY_BYTE_ARRAY) == 0 && (endRowkeyByte.length != 0 && (Bytes.compareTo(regionStartKey, endRowkeyByte) > 0))) {
            continue;
        }
        // If the user-configured userStartKey is greater than or equal to this region's end key, the region should be excluded.
        if ((Bytes.compareTo(regionEndKey, HConstants.EMPTY_BYTE_ARRAY) != 0) && (Bytes.compareTo(startRowkeyByte, regionEndKey) >= 0)) {
            continue;
        }
        // Note: likewise, if the user sets userEndKey to "", this check should not trigger; an empty userEndKey means read up to the last region.
        if (endRowkeyByte.length != 0 && (Bytes.compareTo(endRowkeyByte, regionStartKey) <= 0)) {
            continue;
        }
        Configuration p = config.clone();
        String thisStartKey = getStartKey(startRowkeyByte, regionStartKey);
        String thisEndKey = getEndKey(endRowkeyByte, regionEndKey);
        p.set(Key.START_ROWKEY, thisStartKey);
        p.set(Key.END_ROWKEY, thisEndKey);
        LOG.debug("startRowkey:[{}], endRowkey:[{}] .", thisStartKey, thisEndKey);
        configurations.add(p);
    }
    return configurations;
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) ArrayList(java.util.ArrayList)
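
The three continue branches above implement a range-overlap test between the user-supplied [startRowkey, endRowkey) range and each HBase region. Below is a small, self-contained sketch of that test using only org.apache.hadoop.hbase.util.Bytes and HConstants; the helper name overlaps and the sample row keys are illustrative, not DataX code.

import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.util.Bytes;

public class RegionFilterSketch {

    // Returns true when a region [regionStart, regionEnd) overlaps the user range
    // [userStart, userEnd); an empty end key means "up to the last row".
    static boolean overlaps(byte[] userStart, byte[] userEnd, byte[] regionStart, byte[] regionEnd) {
        boolean lastRegion = Bytes.compareTo(regionEnd, HConstants.EMPTY_BYTE_ARRAY) == 0;
        if (lastRegion && userEnd.length != 0 && Bytes.compareTo(regionStart, userEnd) > 0) {
            return false; // the last region starts after the user's end key
        }
        if (!lastRegion && Bytes.compareTo(userStart, regionEnd) >= 0) {
            return false; // the user's start key is at or past this region's end key
        }
        if (userEnd.length != 0 && Bytes.compareTo(userEnd, regionStart) <= 0) {
            return false; // the user's end key is at or before this region's start key
        }
        return true;
    }

    public static void main(String[] args) {
        byte[] userStart = Bytes.toBytes("row100");
        byte[] userEnd = Bytes.toBytes("row500");
        System.out.println(overlaps(userStart, userEnd,
                Bytes.toBytes("row300"), Bytes.toBytes("row600")));     // true, ranges overlap
        System.out.println(overlaps(userStart, userEnd,
                Bytes.toBytes("row600"), HConstants.EMPTY_BYTE_ARRAY)); // false, region starts past row500
    }
}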

Example 53 with Configuration

Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

Class HdfsHelper, method textFileStartWrite.

/**
     * Write a text-format (textfile) output file.
     * @param lineReceiver        receiver that supplies records from the reader
     * @param config              writer task configuration for this file
     * @param fileName            full path of the target file
     * @param taskPluginCollector collector used to report dirty records
     */
public void textFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName, TaskPluginCollector taskPluginCollector) {
    char fieldDelimiter = config.getChar(Key.FIELD_DELIMITER);
    List<Configuration> columns = config.getListConfiguration(Key.COLUMN);
    String compress = config.getString(Key.COMPRESS, null);
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmm");
    String attempt = "attempt_" + dateFormat.format(new Date()) + "_0001_m_000000_0";
    Path outputPath = new Path(fileName);
    // TODO: the TASK_ATTEMPT_ID built here still needs to be verified.
    conf.set(JobContext.TASK_ATTEMPT_ID, attempt);
    FileOutputFormat outFormat = new TextOutputFormat();
    outFormat.setOutputPath(conf, outputPath);
    outFormat.setWorkOutputPath(conf, outputPath);
    if (null != compress) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }
    try {
        RecordWriter writer = outFormat.getRecordWriter(fileSystem, conf, outputPath.toString(), Reporter.NULL);
        Record record = null;
        while ((record = lineReceiver.getFromReader()) != null) {
            MutablePair<Text, Boolean> transportResult = transportOneRecord(record, fieldDelimiter, columns, taskPluginCollector);
            if (!transportResult.getRight()) {
                writer.write(NullWritable.get(), transportResult.getLeft());
            }
        }
        writer.close(Reporter.NULL);
    } catch (Exception e) {
        String message = String.format("写文件文件[%s]时发生IO异常,请检查您的网络是否正常!", fileName);
        LOG.error(message);
        Path path = new Path(fileName);
        deleteDir(path.getParent());
        throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
    }
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) DataXException(com.alibaba.datax.common.exception.DataXException) Record(com.alibaba.datax.common.element.Record) SimpleDateFormat(java.text.SimpleDateFormat)
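
A hedged sketch of the writer Configuration this method reads. The literal key names fieldDelimiter, compress and column are assumptions about the Key constants above (Key.FIELD_DELIMITER, Key.COMPRESS, Key.COLUMN), and the values are sample data only.

import com.alibaba.datax.common.util.Configuration;

public class TextFileWriterConfigSketch {
    public static void main(String[] args) {
        // Sample writer config; key names are assumed, values are illustrative.
        Configuration config = Configuration.from(
            "{\"fieldDelimiter\": \",\","
          + " \"compress\": \"GZIP\","
          + " \"column\": [ {\"name\": \"id\",   \"type\": \"long\"},"
          + "               {\"name\": \"name\", \"type\": \"string\"} ]}");
        char fieldDelimiter = config.getChar("fieldDelimiter");
        String compress = config.getString("compress", null);
        System.out.println(fieldDelimiter + " | " + compress + " | "
                + config.getListConfiguration("column").size() + " columns");
    }
}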

Example 54 with Configuration

Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

Class HdfsHelper, method orcFileStartWrite.

/**
     * Write an ORC-format (orcfile) output file.
     * @param lineReceiver        receiver that supplies records from the reader
     * @param config              writer task configuration for this file
     * @param fileName            full path of the target file
     * @param taskPluginCollector collector used to report dirty records
     */
public void orcFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName, TaskPluginCollector taskPluginCollector) {
    List<Configuration> columns = config.getListConfiguration(Key.COLUMN);
    String compress = config.getString(Key.COMPRESS, null);
    List<String> columnNames = getColumnNames(columns);
    List<ObjectInspector> columnTypeInspectors = getColumnTypeInspectors(columns);
    StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnTypeInspectors);
    OrcSerde orcSerde = new OrcSerde();
    FileOutputFormat outFormat = new OrcOutputFormat();
    if (!"NONE".equalsIgnoreCase(compress) && null != compress) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }
    try {
        RecordWriter writer = outFormat.getRecordWriter(fileSystem, conf, fileName, Reporter.NULL);
        Record record = null;
        while ((record = lineReceiver.getFromReader()) != null) {
            MutablePair<List<Object>, Boolean> transportResult = transportOneRecord(record, columns, taskPluginCollector);
            if (!transportResult.getRight()) {
                writer.write(NullWritable.get(), orcSerde.serialize(transportResult.getLeft(), inspector));
            }
        }
        writer.close(Reporter.NULL);
    } catch (Exception e) {
        String message = String.format("写文件文件[%s]时发生IO异常,请检查您的网络是否正常!", fileName);
        LOG.error(message);
        Path path = new Path(fileName);
        deleteDir(path.getParent());
        throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(com.alibaba.datax.common.util.Configuration) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) IOException(java.io.IOException) DataXException(com.alibaba.datax.common.exception.DataXException) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Record(com.alibaba.datax.common.element.Record) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
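
To show what writer.write(...) receives in the ORC case, here is a small sketch of the StandardStructObjectInspector / OrcSerde pair for a two-column row. It uses only standard Hive serde2 and ORC APIs; the column layout and values are made up and do not come from the DataX source.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Writable;

public class OrcRowSketch {
    public static void main(String[] args) {
        List<String> columnNames = Arrays.asList("id", "name");
        List<ObjectInspector> columnTypeInspectors = Arrays.<ObjectInspector>asList(
                PrimitiveObjectInspectorFactory.javaLongObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory
                .getStandardStructObjectInspector(columnNames, columnTypeInspectors);
        OrcSerde orcSerde = new OrcSerde();
        // serialize() turns one row (a List matching the inspector) into the Writable
        // that the ORC RecordWriter is handed for each record.
        Writable row = orcSerde.serialize(Arrays.asList(1L, "alice"), inspector);
        System.out.println(row);
    }
}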

Example 55 with Configuration

Use of com.alibaba.datax.common.util.Configuration in project DataX by alibaba.

Class AbstractCollector, method registerTGCommunication.

public void registerTGCommunication(List<Configuration> taskGroupConfigurationList) {
    for (Configuration config : taskGroupConfigurationList) {
        int taskGroupId = config.getInt(CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID);
        LocalTGCommunicationManager.registerTaskGroupCommunication(taskGroupId, new Communication());
    }
}
Also used : Configuration(com.alibaba.datax.common.util.Configuration) Communication(com.alibaba.datax.core.statistics.communication.Communication)
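
A hedged sketch of the taskGroupConfigurationList shape this method expects. The literal path "core.container.taskGroup.id" is an assumption about the value of CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID, and the registration step itself is omitted because LocalTGCommunicationManager is framework-internal.

import java.util.ArrayList;
import java.util.List;

import com.alibaba.datax.common.util.Configuration;

public class TaskGroupConfigSketch {
    public static void main(String[] args) {
        List<Configuration> taskGroupConfigurationList = new ArrayList<Configuration>();
        for (int taskGroupId = 0; taskGroupId < 3; taskGroupId++) {
            Configuration config = Configuration.from("{}");
            // Path assumed to equal CoreConstant.DATAX_CORE_CONTAINER_TASKGROUP_ID.
            config.set("core.container.taskGroup.id", taskGroupId);
            taskGroupConfigurationList.add(config);
        }
        for (Configuration config : taskGroupConfigurationList) {
            System.out.println("task group id: " + config.getInt("core.container.taskGroup.id"));
        }
    }
}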

Aggregations

Configuration (com.alibaba.datax.common.util.Configuration): 82
ArrayList (java.util.ArrayList): 27
Test (org.junit.Test): 19
Communication (com.alibaba.datax.core.statistics.communication.Communication): 13
DataXException (com.alibaba.datax.common.exception.DataXException): 9
Method (java.lang.reflect.Method): 8
Record (com.alibaba.datax.common.element.Record): 7
JobContainer (com.alibaba.datax.core.job.JobContainer): 6
IOException (java.io.IOException): 5
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 5
LongColumn (com.alibaba.datax.common.element.LongColumn): 4
TaskPluginCollector (com.alibaba.datax.common.plugin.TaskPluginCollector): 4
TaskGroupContainer (com.alibaba.datax.core.taskgroup.TaskGroupContainer): 4
Channel (com.alibaba.datax.core.transport.channel.Channel): 4
MemoryChannel (com.alibaba.datax.core.transport.channel.memory.MemoryChannel): 4
DefaultRecord (com.alibaba.datax.core.transport.record.DefaultRecord): 4
File (java.io.File): 4
HashSet (java.util.HashSet): 3
List (java.util.List): 3
VMInfo (com.alibaba.datax.common.statistics.VMInfo): 2