Example 6 with OrcInputFormat

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project DataX by alibaba.

From class DFSUtil, method orcFileStartRead:

public void orcFileStartRead(String sourceOrcFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    LOG.info(String.format("Start Read orcfile [%s].", sourceOrcFilePath));
    List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
    StringBuilder allColumns = new StringBuilder();
    StringBuilder allColumnTypes = new StringBuilder();
    boolean isReadAllColumns = false;
    int columnIndexMax = -1;
    // Decide whether all columns should be read
    if (null == column || column.size() == 0) {
        int allColumnsCount = getAllColumnsCount(sourceOrcFilePath);
        columnIndexMax = allColumnsCount - 1;
        isReadAllColumns = true;
    } else {
        columnIndexMax = getMaxIndex(column);
    }
    for (int i = 0; i <= columnIndexMax; i++) {
        allColumns.append("col");
        allColumnTypes.append("string");
        if (i != columnIndexMax) {
            allColumns.append(",");
            allColumnTypes.append(":");
        }
    }
    if (columnIndexMax >= 0) {
        JobConf conf = new JobConf(hadoopConf);
        Path orcFilePath = new Path(sourceOrcFilePath);
        Properties p = new Properties();
        p.setProperty("columns", allColumns.toString());
        p.setProperty("columns.types", allColumnTypes.toString());
        try {
            OrcSerde serde = new OrcSerde();
            serde.initialize(conf, p);
            StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
            InputFormat<?, ?> in = new OrcInputFormat();
            FileInputFormat.setInputPaths(conf, orcFilePath.toString());
            // If the network is disconnected, the client will retry 45 times, with a 20-second interval between retries
            // Treat each file as a single split
            // TODO: multi-threaded reading
            InputSplit[] splits = in.getSplits(conf, 1);
            RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            // Fetch the struct field (column) metadata
            List<? extends StructField> fields = inspector.getAllStructFieldRefs();
            List<Object> recordFields;
            while (reader.next(key, value)) {
                recordFields = new ArrayList<Object>();
                for (int i = 0; i <= columnIndexMax; i++) {
                    Object field = inspector.getStructFieldData(value, fields.get(i));
                    recordFields.add(field);
                }
                transportOneRecord(column, recordFields, recordSender, taskPluginCollector, isReadAllColumns, nullFormat);
            }
            reader.close();
        } catch (Exception e) {
            String message = String.format("从orcfile文件路径[%s]中读取数据发生异常,请联系系统管理员。", sourceOrcFilePath);
            LOG.error(message);
            throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
        }
    } else {
        String message = String.format("请确认您所读取的列配置正确!columnIndexMax 小于0,column:%s", JSON.toJSONString(column));
        throw DataXException.asDataXException(HdfsReaderErrorCode.BAD_CONFIG_VALUE, message);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), RCFileRecordReader (org.apache.hadoop.hive.ql.io.RCFileRecordReader), ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry), IOException (java.io.IOException), DataXException (com.alibaba.datax.common.exception.DataXException), OrcSerde (org.apache.hadoop.hive.ql.io.orc.OrcSerde), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), JSONObject (com.alibaba.fastjson.JSONObject), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
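
As a companion to the DataX reader above, here is a minimal, self-contained sketch of the same OrcInputFormat/OrcSerde read pattern with the plugin plumbing removed. The file path and the two-column string schema are hypothetical placeholders; error handling and the column-projection logic are left out for brevity.

import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcDumpSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical ORC file with two columns; adjust path and schema to your data.
        String orcFile = "/tmp/example.orc";
        JobConf conf = new JobConf();

        // The serde only needs column names and types; orcFileStartRead above generates
        // "col,col,..." / "string:string:..." pairs, written out here for two columns.
        Properties props = new Properties();
        props.setProperty("columns", "col0,col1");
        props.setProperty("columns.types", "string:string");

        OrcSerde serde = new OrcSerde();
        serde.initialize(conf, props);
        StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();

        InputFormat<?, ?> in = new OrcInputFormat();
        FileInputFormat.setInputPaths(conf, new Path(orcFile).toString());

        // One split for the whole file, as in orcFileStartRead above.
        InputSplit[] splits = in.getSplits(conf, 1);
        RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        List<? extends StructField> fields = inspector.getAllStructFieldRefs();

        while (reader.next(key, value)) {
            StringBuilder row = new StringBuilder();
            for (StructField field : fields) {
                row.append(inspector.getStructFieldData(value, field)).append('\t');
            }
            System.out.println(row);
        }
        reader.close();
    }
}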

Example 7 with OrcInputFormat

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project hive by apache.

From class TestCompactor, method checkExpectedTxnsPresent:

private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty, String columnTypesProperty, int bucket, long min, long max, int numBuckets) throws IOException {
    ValidWriteIdList writeIdList = new ValidWriteIdList() {

        @Override
        public String getTableName() {
            return "AcidTable";
        }

        @Override
        public boolean isWriteIdValid(long writeid) {
            return true;
        }

        @Override
        public RangeResponse isWriteIdRangeValid(long minWriteId, long maxWriteId) {
            return RangeResponse.ALL;
        }

        @Override
        public String writeToString() {
            return "";
        }

        @Override
        public void readFromString(String src) {
        }

        @Override
        public Long getMinOpenWriteId() {
            return null;
        }

        @Override
        public long getHighWatermark() {
            return Long.MAX_VALUE;
        }

        @Override
        public long[] getInvalidWriteIds() {
            return new long[0];
        }

        @Override
        public boolean isValidBase(long writeid) {
            return true;
        }

        @Override
        public boolean isWriteIdAborted(long writeid) {
            return true;
        }

        @Override
        public RangeResponse isWriteIdRangeAborted(long minWriteId, long maxWriteId) {
            return RangeResponse.ALL;
        }
    };
    OrcInputFormat aif = new OrcInputFormat();
    Configuration conf = new Configuration();
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNamesProperty);
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypesProperty);
    conf.set(hive_metastoreConstants.BUCKET_COUNT, Integer.toString(numBuckets));
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    AcidInputFormat.RawReader<OrcStruct> reader = aif.getRawReader(conf, true, bucket, writeIdList, base, deltas);
    RecordIdentifier identifier = reader.createKey();
    OrcStruct value = reader.createValue();
    long currentTxn = min;
    boolean seenCurrentTxn = false;
    while (reader.next(identifier, value)) {
        if (!seenCurrentTxn) {
            Assert.assertEquals(currentTxn, identifier.getWriteId());
            seenCurrentTxn = true;
        }
        if (currentTxn != identifier.getWriteId()) {
            Assert.assertEquals(currentTxn + 1, identifier.getWriteId());
            currentTxn++;
        }
    }
    Assert.assertEquals(max, currentTxn);
}
Also used: AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), Configuration (org.apache.hadoop.conf.Configuration), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList)
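
The anonymous ValidWriteIdList above simply declares every write ID valid, so the raw reader surfaces every row in the base and delta directories. Below is a hedged sketch that packages the same getRawReader wiring into a reusable helper; the two-column schema, the single bucket, and the idea of passing in a pre-built writeIdList (constructed exactly like the anonymous class above) are illustrative assumptions, not part of the Hive test.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

public class AcidRawReadSketch {

    /**
     * Dump every (writeId, row) pair that the given ValidWriteIdList exposes
     * from one bucket of a base directory plus its deltas.
     */
    public static void dumpAcidRows(Path base, Path[] deltas, ValidWriteIdList writeIdList)
            throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical two-column schema; a real caller must match the table's layout.
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b");
        conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int:string");
        conf.set(hive_metastoreConstants.BUCKET_COUNT, "1");
        HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);

        // collapseEvents=true and bucket 0, mirroring checkExpectedTxnsPresent above.
        AcidInputFormat.RawReader<OrcStruct> reader =
                new OrcInputFormat().getRawReader(conf, true, 0, writeIdList, base, deltas);
        RecordIdentifier key = reader.createKey();
        OrcStruct value = reader.createValue();
        while (reader.next(key, value)) {
            // The key carries the write ID (plus bucket and row ID); the value holds the columns.
            System.out.println(key.getWriteId() + "\t" + value);
        }
        reader.close();
    }
}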

Example 8 with OrcInputFormat

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project hive by apache.

From class TestStreaming, method checkDataWritten:

/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, String... records) throws Exception {
    ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    System.out.println("Files found: ");
    for (AcidUtils.ParsedDelta pd : current) {
        System.out.println(pd.getPath().toString());
    }
    Assert.assertEquals(numExpectedFiles, current.size());
    // find the minimum and maximum write IDs across the current delta directories
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta pd : current) {
        if (pd.getMaxWriteId() > max) {
            max = pd.getMaxWriteId();
        }
        if (pd.getMinWriteId() < min) {
            min = pd.getMinWriteId();
        }
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionPath.toString());
    job.set(BUCKET_COUNT, Integer.toString(buckets));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    InputSplit[] splits = inf.getSplits(job, buckets);
    Assert.assertEquals(numExpectedFiles, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    for (String record : records) {
        Assert.assertEquals(true, rr.next(key, value));
        Assert.assertEquals(record, value.toString());
    }
    Assert.assertEquals(false, rr.next(key, value));
}
Also used: NullWritable (org.apache.hadoop.io.NullWritable), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), InputFormat (org.apache.hadoop.mapred.InputFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils), AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)
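
For comparison with the raw-reader path in the previous example, here is a condensed sketch of the non-raw JobConf wiring used by checkDataWritten, folded into a reusable helper. The partition directory is a placeholder, the single-bucket assumption is mine, and the ValidWriteIdList would come from the metastore client exactly as shown above.

import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class AcidPartitionReadSketch {

    /** Print every row of an ACID partition through the regular (non-raw) OrcInputFormat path. */
    public static void printAcidPartition(String partitionDir, ValidWriteIdList writeIds)
            throws Exception {
        JobConf job = new JobConf();
        job.set("mapred.input.dir", partitionDir);
        job.set(hive_metastoreConstants.BUCKET_COUNT, "1");
        job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
        job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
        // Mark the scan as transactional so base and delta files are merged on read.
        AcidUtils.setAcidOperationalProperties(job, true, null);
        job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
        job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());

        InputFormat<NullWritable, OrcStruct> inf = new OrcInputFormat();
        for (InputSplit split : inf.getSplits(job, 1)) {
            RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(split, job, Reporter.NULL);
            NullWritable key = rr.createKey();
            OrcStruct value = rr.createValue();
            while (rr.next(key, value)) {
                System.out.println(value);
            }
            rr.close();
        }
    }
}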

Example 9 with OrcInputFormat

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project Datax by n-kong.

From class DFSUtil, method orcFileStartRead:

public void orcFileStartRead(String sourceOrcFilePath, Configuration readerSliceConfig, RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    LOG.info(String.format("Start Read orcfile [%s].", sourceOrcFilePath));
    List<ColumnEntry> column = UnstructuredStorageReaderUtil.getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    String nullFormat = readerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.NULL_FORMAT);
    StringBuilder allColumns = new StringBuilder();
    StringBuilder allColumnTypes = new StringBuilder();
    boolean isReadAllColumns = false;
    int columnIndexMax = -1;
    // Decide whether all columns should be read
    if (null == column || column.size() == 0) {
        int allColumnsCount = getAllColumnsCount(sourceOrcFilePath);
        columnIndexMax = allColumnsCount - 1;
        isReadAllColumns = true;
    } else {
        columnIndexMax = getMaxIndex(column);
    }
    for (int i = 0; i <= columnIndexMax; i++) {
        allColumns.append("col");
        allColumnTypes.append("string");
        if (i != columnIndexMax) {
            allColumns.append(",");
            allColumnTypes.append(":");
        }
    }
    if (columnIndexMax >= 0) {
        JobConf conf = new JobConf(hadoopConf);
        Path orcFilePath = new Path(sourceOrcFilePath);
        Properties p = new Properties();
        p.setProperty("columns", allColumns.toString());
        p.setProperty("columns.types", allColumnTypes.toString());
        try {
            OrcSerde serde = new OrcSerde();
            serde.initialize(conf, p);
            StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
            InputFormat<?, ?> in = new OrcInputFormat();
            FileInputFormat.setInputPaths(conf, orcFilePath.toString());
            // If the network is disconnected, the client will retry 45 times, with a 20-second interval between retries
            // Treat each file as a single split
            // TODO: multi-threaded reading
            InputSplit[] splits = in.getSplits(conf, 1);
            RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            // Fetch the struct field (column) metadata
            List<? extends StructField> fields = inspector.getAllStructFieldRefs();
            List<Object> recordFields;
            while (reader.next(key, value)) {
                recordFields = new ArrayList<Object>();
                for (int i = 0; i <= columnIndexMax; i++) {
                    Object field = inspector.getStructFieldData(value, fields.get(i));
                    recordFields.add(field);
                }
                transportOneRecord(column, recordFields, recordSender, taskPluginCollector, isReadAllColumns, nullFormat);
            }
            reader.close();
        } catch (Exception e) {
            String message = String.format("从orcfile文件路径[%s]中读取数据发生异常,请联系系统管理员。", sourceOrcFilePath);
            LOG.error(message);
            throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
        }
    } else {
        String message = String.format("请确认您所读取的列配置正确!columnIndexMax 小于0,column:%s", JSON.toJSONString(column));
        throw DataXException.asDataXException(HdfsReaderErrorCode.BAD_CONFIG_VALUE, message);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), RCFileRecordReader (org.apache.hadoop.hive.ql.io.RCFileRecordReader), ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry), IOException (java.io.IOException), DataXException (com.alibaba.datax.common.exception.DataXException), OrcSerde (org.apache.hadoop.hive.ql.io.orc.OrcSerde), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), JSONObject (com.alibaba.fastjson.JSONObject), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 10 with OrcInputFormat

Use of org.apache.hadoop.hive.ql.io.orc.OrcInputFormat in project pxf by greenplum-db.

From class ProfileFactoryTest, method get:

@Test
public void get() {
    // If the user explicitly requested the vectorized ORC profile, it should be used regardless of the input format
    String profileName = ProfileFactory.get(new TextInputFormat(), false, "HiveVectorizedORC");
    assertEquals("HiveVectorizedORC", profileName);
    profileName = ProfileFactory.get(new TextInputFormat(), false, "hivevectorizedorc");
    assertEquals("hivevectorizedorc", profileName);
    // For TextInputFormat when table has no complex types, HiveText profile should be used
    profileName = ProfileFactory.get(new TextInputFormat(), false);
    assertEquals("hive:text", profileName);
    // For TextInputFormat when the table has complex types, the Hive profile should be used, since HiveText doesn't support complex types yet
    profileName = ProfileFactory.get(new TextInputFormat(), true);
    assertEquals("hive", profileName);
    // For RCFileInputFormat when table has complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), true);
    assertEquals("hive:rc", profileName);
    // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
    profileName = ProfileFactory.get(new RCFileInputFormat(), false);
    assertEquals("hive:rc", profileName);
    // For OrcInputFormat when table has complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), true);
    assertEquals("hive:orc", profileName);
    // For OrcInputFormat when table has no complex types, HiveORC profile should be used
    profileName = ProfileFactory.get(new OrcInputFormat(), false);
    assertEquals("hive:orc", profileName);
    // For other formats Hive profile should be used
    profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
    assertEquals("hive", profileName);
}
Also used: TextInputFormat (org.apache.hadoop.mapred.TextInputFormat), RCFileInputFormat (org.apache.hadoop.hive.ql.io.RCFileInputFormat), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), SequenceFileInputFilter (org.apache.hadoop.mapred.SequenceFileInputFilter), Test (org.junit.jupiter.api.Test)
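
To make the mapping this test asserts easier to see at a glance, here is an illustrative Java sketch of equivalent selection logic. It is not the actual ProfileFactory source, just a compact restatement of the behaviour the assertions above pin down.

import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class ProfileMappingSketch {

    /** Illustrative only: mirrors the behaviour asserted in ProfileFactoryTest.get(). */
    public static String profileFor(InputFormat<?, ?> format, boolean hasComplexTypes, String userProfile) {
        if (userProfile != null) {
            // An explicitly requested profile (e.g. HiveVectorizedORC) always wins.
            return userProfile;
        }
        if (format instanceof TextInputFormat) {
            // HiveText does not support complex types yet, so fall back to the generic profile.
            return hasComplexTypes ? "hive" : "hive:text";
        }
        if (format instanceof RCFileInputFormat) {
            return "hive:rc";
        }
        if (format instanceof OrcInputFormat) {
            return "hive:orc";
        }
        return "hive";
    }
}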

Aggregations

OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 10 usages
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 6 usages
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 4 usages
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 4 usages
NullWritable (org.apache.hadoop.io.NullWritable): 4 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 4 usages
JobConf (org.apache.hadoop.mapred.JobConf): 4 usages
ArrayList (java.util.ArrayList): 3 usages
DataXException (com.alibaba.datax.common.exception.DataXException): 2 usages
ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry): 2 usages
JSONObject (com.alibaba.fastjson.JSONObject): 2 usages
IOException (java.io.IOException): 2 usages
Configuration (org.apache.hadoop.conf.Configuration): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 2 usages
AcidRecordReader (org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader): 2 usages
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils): 2 usages
RCFileRecordReader (org.apache.hadoop.hive.ql.io.RCFileRecordReader): 2 usages
OrcSerde (org.apache.hadoop.hive.ql.io.orc.OrcSerde): 2 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2 usages