
Example 11 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

From class TestInputOutputFormat, method testAcidReadPastLastStripeOffset:

@Test
public void testAcidReadPastLastStripeOffset() throws Exception {
    Path baseDir = new Path(workDir, "base_00100");
    testFilePath = new Path(baseDir, "bucket_00000");
    fs.mkdirs(baseDir);
    fs.delete(testFilePath, true);
    TypeDescription fileSchema = TypeDescription.fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint," + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>");
    OrcRecordUpdater.KeyIndexBuilder indexBuilder = new OrcRecordUpdater.KeyIndexBuilder("test");
    OrcFile.WriterOptions options = OrcFile.writerOptions(conf).fileSystem(fs).setSchema(fileSchema).compress(org.apache.orc.CompressionKind.NONE).callback(indexBuilder).stripeSize(128);
    // Create ORC file with small stripe size so we can write multiple stripes.
    Writer writer = OrcFile.createWriter(testFilePath, options);
    VectorizedRowBatch batch = fileSchema.createRowBatch(TypeDescription.RowBatchVersion.USE_DECIMAL64, 1000);
    batch.size = 1000;
    StructColumnVector scv = (StructColumnVector) batch.cols[5];
    // operation
    batch.cols[0].isRepeating = true;
    ((LongColumnVector) batch.cols[0]).vector[0] = OrcRecordUpdater.INSERT_OPERATION;
    // original transaction
    batch.cols[1].isRepeating = true;
    ((LongColumnVector) batch.cols[1]).vector[0] = 1;
    // bucket
    batch.cols[2].isRepeating = true;
    ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf).bucket(0).statementId(0));
    // current transaction
    batch.cols[4].isRepeating = true;
    ((LongColumnVector) batch.cols[4]).vector[0] = 1;
    LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
    for (int r = 0; r < 1000; r++) {
        // row id
        ((LongColumnVector) batch.cols[3]).vector[r] = r;
        // a
        ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
        // b.c
        lcv.vector[r] = r * 10001;
        // d
        ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
        indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, (int) (((LongColumnVector) batch.cols[2]).vector[0]), r);
    }
    // Minimum 5000 rows per stripe.
    for (int idx = 0; idx < 8; ++idx) {
        writer.addRowBatch(batch);
        // bucket
        batch.cols[2].isRepeating = true;
        ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf).bucket(0).statementId(idx + 1));
        for (long row_id : ((LongColumnVector) batch.cols[3]).vector) {
            indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, (int) (((LongColumnVector) batch.cols[2]).vector[0]), row_id);
        }
    }
    writer.close();
    long fileLength = fs.getFileStatus(testFilePath).getLen();
    // Find the last stripe.
    List<StripeInformation> stripes;
    RecordIdentifier[] keyIndex;
    try (Reader orcReader = OrcFile.createReader(fs, testFilePath)) {
        stripes = orcReader.getStripes();
        keyIndex = OrcRecordUpdater.parseKeyIndex(orcReader);
    }
    StripeInformation lastStripe = stripes.get(stripes.size() - 1);
    long lastStripeOffset = lastStripe.getOffset();
    long lastStripeLength = lastStripe.getLength();
    Assert.assertEquals("Index length doesn't match number of stripes", stripes.size(), keyIndex.length);
    Assert.assertEquals("1st Index entry mismatch", new RecordIdentifier(1, 536870916, 999), keyIndex[0]);
    Assert.assertEquals("2nd Index entry mismatch", new RecordIdentifier(1, 536870920, 999), keyIndex[1]);
    // test with same schema with include
    conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
    LOG.info("Last stripe " + stripes.size() + ", offset " + lastStripeOffset + ", length " + lastStripeLength);
    // Specify an OrcSplit that starts beyond the offset of the last stripe.
    OrcSplit split = new OrcSplit(testFilePath, null, lastStripeOffset + 1, lastStripeLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir, null);
    OrcInputFormat inputFormat = new OrcInputFormat();
    AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
    int record = 0;
    OrcRawRecordMerger.ReaderKey id = reader.createKey();
    OrcStruct struct = reader.createValue();
    // The split starts past the offset of the last stripe, so no stripe should be read. Thus 0 records.
    while (reader.next(id, struct)) {
        record += 1;
    }
    assertEquals(0, record);
    reader.close();
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat), VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), TypeDescription (org.apache.orc.TypeDescription), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), RecordWriter (org.apache.hadoop.mapred.RecordWriter), StripeInformation (org.apache.orc.StripeInformation), Test (org.junit.Test)
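The assertions above depend on the physical stripe layout the writer produced (offsets, lengths, row counts). As a minimal, hypothetical sketch that is not part of the Hive test, the standalone org.apache.orc reader API can be used to inspect those stripe boundaries for any ORC file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

public class StripeLayoutDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        // Reader is Closeable in recent ORC releases, as the try-with-resources in the test assumes.
        try (Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf))) {
            int i = 0;
            for (StripeInformation stripe : reader.getStripes()) {
                // getOffset()/getLength() are the values OrcSplit boundaries are compared against.
                System.out.printf("stripe %d: offset=%d length=%d rows=%d%n",
                        i++, stripe.getOffset(), stripe.getLength(), stripe.getNumberOfRows());
            }
        }
    }
}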

Example 12 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

From class OrcFileStripeMergeRecordReader, method nextStripe:

protected boolean nextStripe(OrcFileKeyWrapper keyWrapper, OrcFileValueWrapper valueWrapper) throws IOException {
    // Stripe statistics can be absent even though rows are present (old ORC format).
    // We have to differentiate no stats (empty file) vs missing stats (old format).
    if ((stripeStatistics == null || stripeStatistics.isEmpty()) && reader.getNumberOfRows() > 0) {
        keyWrapper.setInputPath(path);
        keyWrapper.setIsIncompatFile(true);
        skipFile = true;
        return true;
    }
    // The file split starts at 0, hence this mapper owns the concatenation of all stripes in the file.
    if (iter.hasNext()) {
        StripeInformation si = iter.next();
        valueWrapper.setStripeStatistics(stripeStatistics.get(stripeIdx));
        valueWrapper.setStripeInformation(si);
        if (!iter.hasNext()) {
            valueWrapper.setLastStripeInFile(true);
            Map<String, ByteBuffer> userMeta = new HashMap<>();
            for (String key : reader.getMetadataKeys()) {
                userMeta.put(key, reader.getMetadataValue(key));
            }
            valueWrapper.setUserMetadata(userMeta);
        }
        keyWrapper.setInputPath(path);
        keyWrapper.setCompression(reader.getCompressionKind());
        keyWrapper.setCompressBufferSize(reader.getCompressionSize());
        keyWrapper.setFileVersion(reader.getFileVersion());
        keyWrapper.setWriterVersion(reader.getWriterVersion());
        keyWrapper.setRowIndexStride(reader.getRowIndexStride());
        keyWrapper.setFileSchema(reader.getSchema());
        stripeIdx++;
        return true;
    }
    return false;
}
Also used: HashMap (java.util.HashMap), ByteBuffer (java.nio.ByteBuffer), StripeInformation (org.apache.orc.StripeInformation)
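nextStripe() above copies the file's user metadata only when it emits the last stripe, so the merge writer can re-attach it once. A hedged, standalone sketch of that metadata read; the printed decoding is best-effort only, since metadata values are opaque bytes:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class UserMetadataDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        try (Reader reader = OrcFile.createReader(new Path(args[0]), OrcFile.readerOptions(conf))) {
            // Same copy loop as nextStripe(): metadata key -> opaque ByteBuffer value.
            Map<String, ByteBuffer> userMeta = new HashMap<>();
            for (String key : reader.getMetadataKeys()) {
                userMeta.put(key, reader.getMetadataValue(key));
            }
            // For ACID files one of these keys is the key index referenced elsewhere on this page
            // (OrcRecordUpdater#ACID_KEY_INDEX_NAME); values are not guaranteed to be UTF-8.
            userMeta.forEach((key, value) ->
                    System.out.println(key + " = " + StandardCharsets.UTF_8.decode(value.duplicate())));
        }
    }
}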

Example 13 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

From class OrcRawRecordMerger, method discoverOriginalKeyBounds:

/**
 * Find the key range for original bucket files.
 * For unbucketed tables the insert event data is still written to bucket_N file except that
 * N is just a writer ID - it still matches {@link RecordIdentifier#getBucketProperty()}.  For
 * 'original' files (unbucketed) the same applies.  A file 000000_0 encodes a taskId/writerId and
 * at read time we synthesize {@link RecordIdentifier#getBucketProperty()} to match the file name
 * and so the same bucketProperty is used here to create minKey/maxKey, i.e. these keys are valid
 * to filter data from delete_delta files even for unbucketed tables.
 * @param reader the reader
 * @param bucket the bucket number we are reading
 * @param options the options for reading with
 * @throws IOException
 */
private KeyInterval discoverOriginalKeyBounds(Reader reader, int bucket, Reader.Options options, Configuration conf, Options mergerOptions) throws IOException {
    long rowLength = 0;
    long rowOffset = 0;
    // this would usually be at block boundary
    long offset = options.getOffset();
    // this would usually be at block boundary
    long maxOffset = options.getMaxOffset();
    boolean isTail = true;
    RecordIdentifier minKey = null;
    RecordIdentifier maxKey = null;
    TransactionMetaData tfp = TransactionMetaData.findWriteIDForSynthetcRowIDs(mergerOptions.getBucketPath(), mergerOptions.getRootPath(), conf);
    int bucketProperty = encodeBucketId(conf, bucket, tfp.statementId);
    /**
     * options.getOffset() and getMaxOffset() would usually be at block boundary which doesn't
     * necessarily match stripe boundary.  So we want to come up with minKey to be one before the 1st
     * row of the first stripe that starts after getOffset() and maxKey to be the last row of the
     * stripe that contains getMaxOffset().  This breaks if getOffset() and getMaxOffset() are inside
     * the same stripe - in this case we have minKey & isTail=false but rowLength is never set.
     * (HIVE-16953)
     */
    for (StripeInformation stripe : reader.getStripes()) {
        if (offset > stripe.getOffset()) {
            rowOffset += stripe.getNumberOfRows();
        } else if (maxOffset > stripe.getOffset()) {
            rowLength += stripe.getNumberOfRows();
        } else {
            isTail = false;
            break;
        }
    }
    if (rowOffset > 0) {
        minKey = new RecordIdentifier(tfp.syntheticWriteId, bucketProperty, rowOffset - 1);
    }
    if (!isTail) {
        maxKey = new RecordIdentifier(tfp.syntheticWriteId, bucketProperty, rowOffset + rowLength - 1);
    }
    return new KeyInterval(minKey, maxKey);
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StripeInformation (org.apache.orc.StripeInformation)
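Both the test in Example 11 and discoverOriginalKeyBounds() above work with the encoded bucketProperty rather than the raw bucket number. The following is only an illustrative sketch of that encode/decode round trip, using the BucketCodec methods as they appear in Hive; it is not taken from the example code itself:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.BucketCodec;

public class BucketPropertyRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Encode bucket 0 / statement 0, the same call the test in Example 11 makes.
        int bucketProperty = BucketCodec.V1.encode(
                new AcidOutputFormat.Options(conf).bucket(0).statementId(0));
        System.out.println("bucketProperty = " + bucketProperty);

        // Decode it back; the codec version is carried in the high bits of the property.
        BucketCodec codec = BucketCodec.determineVersion(bucketProperty);
        System.out.println("bucketId    = " + codec.decodeWriterId(bucketProperty));
        System.out.println("statementId = " + codec.decodeStatementId(bucketProperty));
    }
}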

Example 14 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

From class VectorizedOrcAcidRowBatchReader, method findMinMaxKeys:

/**
 * A given ORC reader will always process one or more whole stripes but the
 * split boundaries may not line up with stripe boundaries if the InputFormat
 * doesn't understand ORC specifics. So first we need to figure out which
 * stripe(s) we are reading.
 *
 * Suppose txn1 writes 100K rows
 * and txn2 writes 100 rows so we have events
 * {1,0,0}....{1,0,100K},{2,0,0}...{2,0,100} in 2 files
 * After compaction we may have 2 stripes
 * {1,0,0}...{1,0,90K},{1,0,90001}...{2,0,100}
 *
 * Now suppose there is a delete stmt that deletes every row.  So when we load
 * the 2nd stripe, if we just look at stripe {@link ColumnStatistics},
 * minKey={1,0,100} and maxKey={2,0,90001}, all but the 1st 100 delete events
 * will get loaded.  But with {@link OrcRecordUpdater#ACID_KEY_INDEX_NAME},
 * minKey={1,0,90001} and maxKey={2,0,100} so we only load about 10K deletes.
 *
 * Also, even with Query Based compactor (once we have it), FileSinkOperator
 * uses OrcRecordWriter to write to file, so we should have the
 * hive.acid.index in place.
 *
 * If reading the 1st stripe, we don't have the start event, so we'll get it
 * from stats, which will strictly speaking be accurate only wrt writeId and
 * bucket but that is good enough.
 *
 * @return empty <code>KeyInterval</code> if KeyInterval could not be
 * determined
 */
private OrcRawRecordMerger.KeyInterval findMinMaxKeys(OrcSplit orcSplit, Configuration conf, Reader.Options deleteEventReaderOptions) throws IOException {
    final boolean noDeleteDeltas = orcSplit.getDeltas().size() == 0;
    if (!HiveConf.getBoolVar(conf, ConfVars.FILTER_DELETE_EVENTS) || noDeleteDeltas) {
        LOG.debug("findMinMaxKeys() " + ConfVars.FILTER_DELETE_EVENTS + "=false");
        return new OrcRawRecordMerger.KeyInterval(null, null);
    }
    try (VectorizedOrcAcidRowBatchReader.ReaderData orcReaderData = getOrcReaderData(orcSplit.getPath(), conf, cacheTag, orcSplit.getFileKey())) {
        if (orcSplit.isOriginal()) {
            /**
             * Among originals we may have files with _copy_N suffix.  To properly
             * generate a synthetic ROW___ID for them we need
             * {@link OffsetAndBucketProperty} which could be an expensive computation
             * if there are lots of copy_N files for a given bucketId. But unless
             * there are delete events, we often don't need synthetic ROW__IDs at all.
             * Kind of chicken-and-egg - deal with this later.
             * See {@link OrcRawRecordMerger#discoverOriginalKeyBounds(Reader, int,
             * Reader.Options, Configuration, OrcRawRecordMerger.Options)}
             */
            LOG.debug("findMinMaxKeys(original split)");
            return findOriginalMinMaxKeys(orcSplit, orcReaderData.orcTail, deleteEventReaderOptions);
        }
        List<StripeInformation> stripes = orcReaderData.orcTail.getStripes();
        final long splitStart = orcSplit.getStart();
        final long splitEnd = splitStart + orcSplit.getLength();
        int firstStripeIndex = -1;
        int lastStripeIndex = -1;
        for (int i = 0; i < stripes.size(); i++) {
            StripeInformation stripe = stripes.get(i);
            long stripeEnd = stripe.getOffset() + stripe.getLength();
            if (firstStripeIndex == -1 && stripe.getOffset() >= splitStart) {
                firstStripeIndex = i;
            }
            if (lastStripeIndex == -1 && splitEnd <= stripeEnd) {
                lastStripeIndex = i;
            }
        }
        if (lastStripeIndex == -1) {
            // split goes to the EOF which is > end of stripe since file has a footer
            assert stripes.get(stripes.size() - 1).getOffset() + stripes.get(stripes.size() - 1).getLength() < splitEnd;
            lastStripeIndex = stripes.size() - 1;
        }
        if (firstStripeIndex > lastStripeIndex || firstStripeIndex == -1) {
            /**
             * If the firstStripeIndex was set after the lastStripeIndex the split lies entirely within a single stripe.
             * In case the split lies entirely within the last stripe, the firstStripeIndex will never be found, hence the
             * second condition.
             * In this case, the reader for this split will not read any data.
             * See {@link org.apache.orc.impl.RecordReaderImpl#RecordReaderImpl}.
             * Create a KeyInterval such that no delete delta records are loaded into memory in the deleteEventRegistry.
             */
            long minRowId = 1;
            long maxRowId = 0;
            int minBucketProp = 1;
            int maxBucketProp = 0;
            OrcRawRecordMerger.KeyInterval keyIntervalTmp = new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(1, minBucketProp, minRowId), new RecordIdentifier(0, maxBucketProp, maxRowId));
            setSARG(keyIntervalTmp, deleteEventReaderOptions, minBucketProp, maxBucketProp, minRowId, maxRowId);
            LOG.info("findMinMaxKeys(): " + keyIntervalTmp + " stripes(" + firstStripeIndex + "," + lastStripeIndex + ")");
            return keyIntervalTmp;
        }
        if (firstStripeIndex == -1 || lastStripeIndex == -1) {
            // this should not happen but... if we don't know which stripe(s) are
            // involved we can't figure out min/max bounds
            LOG.warn("Could not find stripe (" + firstStripeIndex + "," + lastStripeIndex + ")");
            return new OrcRawRecordMerger.KeyInterval(null, null);
        }
        RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(orcReaderData.orcTail);
        if (keyIndex == null) {
            LOG.warn("Could not find keyIndex (" + firstStripeIndex + "," + lastStripeIndex + "," + stripes.size() + ")");
        }
        if (keyIndex != null && keyIndex.length != stripes.size()) {
            LOG.warn("keyIndex length doesn't match (" + firstStripeIndex + "," + lastStripeIndex + "," + stripes.size() + "," + keyIndex.length + ")");
            return new OrcRawRecordMerger.KeyInterval(null, null);
        }
        /**
         * If {@link OrcConf#ROW_INDEX_STRIDE} is set to 0, all column stats in the
         * ORC file are disabled: the objects for them still exist but have
         * min/max set to MIN_LONG/MAX_LONG, so we only use column stats if they
         * were actually computed.  Streaming ingest and minor compaction used to
         * set the stride to 0, so there are lots of legacy files with missing
         * (rather, bad) column stats.
         */
        boolean columnStatsPresent = orcReaderData.orcTail.getFooter().getRowIndexStride() > 0;
        if (!columnStatsPresent) {
            LOG.debug("findMinMaxKeys() No ORC column stats");
        }
        List<StripeStatistics> stats = orcReaderData.reader.getVariantStripeStatistics(null);
        assert stripes.size() == stats.size() : "str.s=" + stripes.size() + " sta.s=" + stats.size();
        RecordIdentifier minKey = null;
        if (firstStripeIndex > 0 && keyIndex != null) {
            // valid keys are strictly > than this key
            minKey = keyIndex[firstStripeIndex - 1];
            // add 1 to make comparison >= to match the case of 0th stripe
            minKey.setRowId(minKey.getRowId() + 1);
        } else {
            if (columnStatsPresent) {
                minKey = getKeyInterval(stats.get(firstStripeIndex).getColumnStatistics()).getMinKey();
            }
        }
        RecordIdentifier maxKey = null;
        if (keyIndex != null) {
            maxKey = keyIndex[lastStripeIndex];
        } else {
            if (columnStatsPresent) {
                maxKey = getKeyInterval(stats.get(lastStripeIndex).getColumnStatistics()).getMaxKey();
            }
        }
        OrcRawRecordMerger.KeyInterval keyInterval = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
        LOG.info("findMinMaxKeys(): " + keyInterval + " stripes(" + firstStripeIndex + "," + lastStripeIndex + ")");
        long minBucketProp = Long.MAX_VALUE, maxBucketProp = Long.MIN_VALUE;
        long minRowId = Long.MAX_VALUE, maxRowId = Long.MIN_VALUE;
        if (columnStatsPresent) {
            /**
             * Figure out min/max bucket and rowId for push down.  This is different from
             * min/max ROW__ID because ROW__ID comparison uses dictionary order on
             * tuples (a,b,c), but PPD can only do
             * (a between (x,y) and b between (x1,y1) and c between (x2,y2)).
             * Consider:
             * (0,536936448,0), (0,536936448,2), (10000001,536936448,0)
             * 1st is min ROW__ID, 3rd is max ROW__ID
             * and Delete events (0,536936448,2),....,(10000001,536936448,1000000)
             * So PPD based on min/max ROW__ID would have 0 <= rowId <= 0, which will
             * miss this delete event.  But we still want PPD to filter out data if
             * possible.
             *
             * So use stripe stats to find proper min/max for bucketProp and rowId;
             * writeId is the same in both cases.
             */
            for (int i = firstStripeIndex; i <= lastStripeIndex; i++) {
                OrcRawRecordMerger.KeyInterval key = getKeyInterval(stats.get(i).getColumnStatistics());
                if (key.getMinKey().getBucketProperty() < minBucketProp) {
                    minBucketProp = key.getMinKey().getBucketProperty();
                }
                if (key.getMaxKey().getBucketProperty() > maxBucketProp) {
                    maxBucketProp = key.getMaxKey().getBucketProperty();
                }
                if (key.getMinKey().getRowId() < minRowId) {
                    minRowId = key.getMinKey().getRowId();
                }
                if (key.getMaxKey().getRowId() > maxRowId) {
                    maxRowId = key.getMaxKey().getRowId();
                }
            }
        }
        if (minBucketProp == Long.MAX_VALUE)
            minBucketProp = Long.MIN_VALUE;
        if (maxBucketProp == Long.MIN_VALUE)
            maxBucketProp = Long.MAX_VALUE;
        if (minRowId == Long.MAX_VALUE)
            minRowId = Long.MIN_VALUE;
        if (maxRowId == Long.MIN_VALUE)
            maxRowId = Long.MAX_VALUE;
        setSARG(keyInterval, deleteEventReaderOptions, minBucketProp, maxBucketProp, minRowId, maxRowId);
        return keyInterval;
    }
}
Also used: StripeStatistics (org.apache.orc.StripeStatistics), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StripeInformation (org.apache.orc.StripeInformation)
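findMinMaxKeys() finishes by pushing the widened bucketProperty/rowId bounds into the delete-event reader via setSARG(). The sketch below only illustrates how such a range predicate can be built with the SearchArgument API; the column names ("bucket", "rowId") are placeholders, not the real setSARG() implementation:

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class DeleteEventSargSketch {
    // Hypothetical helper: restrict delete events to the key interval computed above.
    static SearchArgument buildSarg(long minBucketProp, long maxBucketProp,
                                    long minRowId, long maxRowId) {
        return SearchArgumentFactory.newBuilder()
                .startAnd()
                .between("bucket", PredicateLeaf.Type.LONG, minBucketProp, maxBucketProp)
                .between("rowId", PredicateLeaf.Type.LONG, minRowId, maxRowId)
                .end()
                .build();
    }

    public static void main(String[] args) {
        // A fully open interval (the Long.MIN_VALUE/MAX_VALUE fallback in the method above)
        // effectively disables the push-down, matching the "no usable stats" path.
        System.out.println(buildSarg(Long.MIN_VALUE, Long.MAX_VALUE, Long.MIN_VALUE, Long.MAX_VALUE));
    }
}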

Example 15 with StripeInformation

use of org.apache.orc.StripeInformation in project hive by apache.

From class VectorizedOrcAcidRowBatchReader, method findOriginalMinMaxKeys:

private OrcRawRecordMerger.KeyInterval findOriginalMinMaxKeys(OrcSplit orcSplit, OrcTail orcTail, Reader.Options deleteEventReaderOptions) {
    if (syntheticProps == null) {
        // If there aren't any delete delta files, then we don't need this anyway.
        return new OrcRawRecordMerger.KeyInterval(null, null);
    }
    long splitStart = orcSplit.getStart();
    long splitEnd = orcSplit.getStart() + orcSplit.getLength();
    long minRowId = syntheticProps.getRowIdOffset();
    long maxRowId = syntheticProps.getRowIdOffset();
    for (StripeInformation stripe : orcTail.getStripes()) {
        if (splitStart > stripe.getOffset()) {
            // This stripe starts before the current split starts. This stripe is not included in this split.
            minRowId += stripe.getNumberOfRows();
        }
        if (splitEnd > stripe.getOffset()) {
            // This stripe starts before the current split ends.
            maxRowId += stripe.getNumberOfRows();
        } else {
            // Remaining stripes are not included in this split.
            break;
        }
    }
    RecordIdentifier minKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(), syntheticProps.getBucketProperty(), minRowId);
    RecordIdentifier maxKey = new RecordIdentifier(syntheticProps.getSyntheticWriteId(), syntheticProps.getBucketProperty(), maxRowId > 0 ? maxRowId - 1 : 0);
    OrcRawRecordMerger.KeyInterval keyIntervalTmp = new OrcRawRecordMerger.KeyInterval(minKey, maxKey);
    if (minRowId >= maxRowId) {
        /**
         * The split lies entirely within a single stripe. In this case, the reader for this split will not read any data.
         * See {@link org.apache.orc.impl.RecordReaderImpl#RecordReaderImpl}.
         * We can return the min max key interval as is (it will not read any of the delete delta records into mem)
         */
        LOG.info("findOriginalMinMaxKeys(): This split starts and ends in the same stripe.");
    }
    LOG.info("findOriginalMinMaxKeys(): " + keyIntervalTmp);
    // Using min/max ROW__ID from original will work for ppd to the delete deltas because the writeid is the same in
    // the min and the max ROW__ID
    setSARG(keyIntervalTmp, deleteEventReaderOptions, minKey.getBucketProperty(), maxKey.getBucketProperty(), minKey.getRowId(), maxKey.getRowId());
    return keyIntervalTmp;
}
Also used: RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), StripeInformation (org.apache.orc.StripeInformation)
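Both findOriginalMinMaxKeys() and findMinMaxKeys() ultimately produce a KeyInterval of RecordIdentifier bounds that the delete-event registry filters against. The following is a hypothetical, self-contained sketch of that containment check using RecordIdentifier's natural (writeId, bucketProperty, rowId) ordering; note that Hive treats the minKey as exclusive in some paths, whereas this sketch uses inclusive bounds for simplicity:

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

public class KeyIntervalCheck {
    // Hypothetical helper, not Hive's KeyInterval class: null bounds mean "unbounded",
    // mirroring new KeyInterval(null, null) in the methods above.
    static boolean inInterval(RecordIdentifier minKey, RecordIdentifier maxKey,
                              RecordIdentifier candidate) {
        boolean aboveMin = minKey == null || candidate.compareTo(minKey) >= 0;
        boolean belowMax = maxKey == null || candidate.compareTo(maxKey) <= 0;
        return aboveMin && belowMax;
    }

    public static void main(String[] args) {
        // 536870912 is used here only as an example bucketProperty value.
        RecordIdentifier min = new RecordIdentifier(1, 536870912, 0);
        RecordIdentifier max = new RecordIdentifier(1, 536870912, 999);
        System.out.println(inInterval(min, max, new RecordIdentifier(1, 536870912, 500))); // true
        System.out.println(inInterval(min, max, new RecordIdentifier(2, 536870912, 0)));   // false
    }
}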

Aggregations

StripeInformation (org.apache.orc.StripeInformation): 30
Test (org.junit.Test): 10
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 9
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 8
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 8
Path (org.apache.hadoop.fs.Path): 7
OrcProto (org.apache.orc.OrcProto): 7
ArrayList (java.util.ArrayList): 6
Random (java.util.Random): 6
OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata): 5
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 5
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 5
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 5
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 5
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 5
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 5
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 5
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 5
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector): 5