
Example 6 with OrcSerde

use of org.apache.hadoop.hive.ql.io.orc.OrcSerde in project DataX by alibaba.

the class DFSUtil, method orcFileStartRead.

public void orcFileStartRead(String sourceOrcFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    LOG.info(String.format("Start Read orcfile [%s].", sourceOrcFilePath));
    List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
    StringBuilder allColumns = new StringBuilder();
    StringBuilder allColumnTypes = new StringBuilder();
    boolean isReadAllColumns = false;
    int columnIndexMax = -1;
    // Determine whether to read all columns
    if (null == column || column.size() == 0) {
        int allColumnsCount = getAllColumnsCount(sourceOrcFilePath);
        columnIndexMax = allColumnsCount - 1;
        isReadAllColumns = true;
    } else {
        columnIndexMax = getMaxIndex(column);
    }
    for (int i = 0; i <= columnIndexMax; i++) {
        allColumns.append("col");
        allColumnTypes.append("string");
        if (i != columnIndexMax) {
            allColumns.append(",");
            allColumnTypes.append(":");
        }
    }
    if (columnIndexMax >= 0) {
        JobConf conf = new JobConf(hadoopConf);
        Path orcFilePath = new Path(sourceOrcFilePath);
        Properties p = new Properties();
        p.setProperty("columns", allColumns.toString());
        p.setProperty("columns.types", allColumnTypes.toString());
        try {
            OrcSerde serde = new OrcSerde();
            serde.initialize(conf, p);
            StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
            InputFormat<?, ?> in = new OrcInputFormat();
            FileInputFormat.setInputPaths(conf, orcFilePath.toString());
            // If the network is disconnected, it will retry 45 times, with a 20-second interval between retries
            // Each file is treated as a single split
            // TODO: support multiple threads
            InputSplit[] splits = in.getSplits(conf, 1);
            RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            // Get the column information
            List<? extends StructField> fields = inspector.getAllStructFieldRefs();
            List<Object> recordFields;
            while (reader.next(key, value)) {
                recordFields = new ArrayList<Object>();
                for (int i = 0; i <= columnIndexMax; i++) {
                    Object field = inspector.getStructFieldData(value, fields.get(i));
                    recordFields.add(field);
                }
                transportOneRecord(column, recordFields, recordSender, taskPluginCollector, isReadAllColumns, nullFormat);
            }
            reader.close();
        } catch (Exception e) {
            String message = String.format("从orcfile文件路径[%s]中读取数据发生异常,请联系系统管理员。", sourceOrcFilePath);
            LOG.error(message);
            throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
        }
    } else {
        String message = String.format("请确认您所读取的列配置正确!columnIndexMax 小于0,column:%s", JSON.toJSONString(column));
        throw DataXException.asDataXException(HdfsReaderErrorCode.BAD_CONFIG_VALUE, message);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) RCFileRecordReader(org.apache.hadoop.hive.ql.io.RCFileRecordReader) ColumnEntry(com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry) IOException(java.io.IOException) DataXException(com.alibaba.datax.common.exception.DataXException) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) JSONObject(com.alibaba.fastjson.JSONObject) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
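For context, here is a minimal standalone sketch of the same read pattern outside DataX: OrcSerde is initialized with the "columns"/"columns.types" properties to obtain a StructObjectInspector, and OrcInputFormat supplies the splits and the RecordReader. The class name, file path, and two-string-column layout below are illustrative assumptions, not taken from the project.

import java.util.List;
import java.util.Properties;

import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcReadSketch {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();

        // Tell the serde the column names and types; this is what drives the ObjectInspector.
        Properties props = new Properties();
        props.setProperty("columns", "id,name");
        props.setProperty("columns.types", "string:string");

        OrcSerde serde = new OrcSerde();
        serde.initialize(conf, props);
        StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();

        InputFormat<?, ?> inputFormat = new OrcInputFormat();
        // Assumed local test path.
        FileInputFormat.setInputPaths(conf, "/tmp/example.orc");

        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        for (InputSplit split : splits) {
            RecordReader reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            List<? extends StructField> fields = inspector.getAllStructFieldRefs();
            while (reader.next(key, value)) {
                // Each field value comes back as a Hive/Writable object; print it for demonstration.
                for (StructField field : fields) {
                    System.out.print(inspector.getStructFieldData(value, field) + "\t");
                }
                System.out.println();
            }
            reader.close();
        }
    }
}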

Example 7 with OrcSerde

use of org.apache.hadoop.hive.ql.io.orc.OrcSerde in project DataX by alibaba.

the class HdfsHelper, method orcFileStartWrite.

/**
     * Write a file in orcfile format
     * @param lineReceiver
     * @param config
     * @param fileName
     * @param taskPluginCollector
     */
public void orcFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName, TaskPluginCollector taskPluginCollector) {
    List<Configuration> columns = config.getListConfiguration(Key.COLUMN);
    String compress = config.getString(Key.COMPRESS, null);
    List<String> columnNames = getColumnNames(columns);
    List<ObjectInspector> columnTypeInspectors = getColumnTypeInspectors(columns);
    StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnTypeInspectors);
    OrcSerde orcSerde = new OrcSerde();
    FileOutputFormat outFormat = new OrcOutputFormat();
    if (!"NONE".equalsIgnoreCase(compress) && null != compress) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }
    try {
        RecordWriter writer = outFormat.getRecordWriter(fileSystem, conf, fileName, Reporter.NULL);
        Record record = null;
        while ((record = lineReceiver.getFromReader()) != null) {
            MutablePair<List<Object>, Boolean> transportResult = transportOneRecord(record, columns, taskPluginCollector);
            if (!transportResult.getRight()) {
                writer.write(NullWritable.get(), orcSerde.serialize(transportResult.getLeft(), inspector));
            }
        }
        writer.close(Reporter.NULL);
    } catch (Exception e) {
        String message = String.format("写文件文件[%s]时发生IO异常,请检查您的网络是否正常!", fileName);
        LOG.error(message);
        Path path = new Path(fileName);
        deleteDir(path.getParent());
        throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(com.alibaba.datax.common.util.Configuration) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) IOException(java.io.IOException) DataXException(com.alibaba.datax.common.exception.DataXException) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Record(com.alibaba.datax.common.element.Record) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
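As a companion to the write path above, here is a minimal standalone sketch of the same pattern: a StandardStructObjectInspector describes the row layout, OrcSerde.serialize turns a List of field values into a Writable, and OrcOutputFormat's RecordWriter appends it to the file. The class name, output path, column names, and sample rows are illustrative assumptions.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

public class OrcWriteSketch {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Uses the default file system (local unless fs.defaultFS points elsewhere).
        FileSystem fileSystem = FileSystem.get(conf);

        // Two string columns, mirroring the columnNames/columnTypeInspectors built in the example above.
        List<String> columnNames = Arrays.asList("id", "name");
        List<ObjectInspector> columnInspectors = Arrays.<ObjectInspector>asList(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        StructObjectInspector inspector =
                ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnInspectors);

        OrcSerde orcSerde = new OrcSerde();
        OrcOutputFormat outFormat = new OrcOutputFormat();
        // Assumed output path for the sketch.
        RecordWriter writer = outFormat.getRecordWriter(fileSystem, conf, "/tmp/example_out.orc", Reporter.NULL);

        // Each row is a List<Object> whose elements line up with the inspector's fields.
        List<List<Object>> rows = Arrays.asList(
                Arrays.<Object>asList("1", "alice"),
                Arrays.<Object>asList("2", "bob"));
        for (List<Object> row : rows) {
            writer.write(NullWritable.get(), orcSerde.serialize(row, inspector));
        }
        writer.close(Reporter.NULL);
    }
}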

Aggregations

OrcSerde (org.apache.hadoop.hive.ql.io.orc.OrcSerde): 7
Serializer (org.apache.hadoop.hive.serde2.Serializer): 3
SettableStructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector): 3
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 3
Writable (org.apache.hadoop.io.Writable): 3
DataXException (com.alibaba.datax.common.exception.DataXException): 2
OrcTester.createSettableStructObjectInspector (com.facebook.presto.orc.OrcTester.createSettableStructObjectInspector): 2
IOException (java.io.IOException): 2
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2
Record (com.alibaba.datax.common.element.Record): 1
Configuration (com.alibaba.datax.common.util.Configuration): 1
ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry): 1
JSONObject (com.alibaba.fastjson.JSONObject): 1
Path (org.apache.hadoop.fs.Path): 1
RCFileRecordReader (org.apache.hadoop.hive.ql.io.RCFileRecordReader): 1
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 1
OrcOutputFormat (org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat): 1
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 1
CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo): 1