Example 1 with StripeInformation

Use of org.apache.orc.StripeInformation in project hive by apache.

From class OrcEncodedDataReader, method determineRgsToRead:

/**
   * Determines which RGs need to be read, after stripes have been determined.
   * SARG is applied, and readState is populated for each stripe accordingly.
   */
private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride, ArrayList<OrcStripeMetadata> metadata) throws IOException {
    RecordReaderImpl.SargApplier sargApp = null;
    if (sarg != null && rowIndexStride != 0) {
        List<OrcProto.Type> types = fileMetadata.getTypes();
        String[] colNamesForSarg = OrcInputFormat.getSargColumnNames(columnNames, types, globalIncludes, fileMetadata.isOriginalFormat());
        sargApp = new RecordReaderImpl.SargApplier(sarg, colNamesForSarg, rowIndexStride, evolution, OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum()));
    }
    boolean hasAnyData = false;
    // readState should have been initialized by this time with an empty array.
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
        int stripeIx = stripeIxMod + stripeIxFrom;
        StripeInformation stripe = fileMetadata.getStripes().get(stripeIx);
        int rgCount = getRgCount(stripe, rowIndexStride);
        boolean[] rgsToRead = null;
        if (sargApp != null) {
            OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod);
            rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getBloomFilterKinds(), stripeMetadata.getBloomFilterIndexes(), true);
        }
        boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS;
        boolean isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS;
        hasAnyData = hasAnyData || !isNone;
        if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
            if (isNone) {
                LlapIoImpl.ORC_LOGGER.trace("SARG eliminated all RGs for stripe {}", stripeIx);
            } else if (!isAll) {
                LlapIoImpl.ORC_LOGGER.trace("SARG picked RGs for stripe {}: {}", stripeIx, DebugUtils.toString(rgsToRead));
            } else {
                LlapIoImpl.ORC_LOGGER.trace("Will read all {} RGs for stripe {}", rgCount, stripeIx);
            }
        }
        assert isAll || isNone || rgsToRead.length == rgCount;
        int fileIncludesCount = 0;
        // We don't need a separate readState w/o HL cache; should get rid of that instead.
        // Note: column 0 is the ORC root struct, so real columns start at index 1.
        for (int includeIx = 1; includeIx < globalIncludes.length; ++includeIx) {
            fileIncludesCount += (globalIncludes[includeIx] ? 1 : 0);
        }
        readState[stripeIxMod] = new boolean[fileIncludesCount][];
        for (int includeIx = 0; includeIx < fileIncludesCount; ++includeIx) {
            readState[stripeIxMod][includeIx] = (isAll || isNone) ? rgsToRead : Arrays.copyOf(rgsToRead, rgsToRead.length);
        }
        adjustRgMetric(rgCount, rgsToRead, isNone, isAll);
    }
    return hasAnyData;
}
Also used: RecordReaderImpl (org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl), StripeInformation (org.apache.orc.StripeInformation), OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata)
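The getRgCount helper called above is not shown on this page. A minimal sketch of it, assuming the usual ORC convention that a stripe is divided into row groups of rowIndexStride rows with a possibly shorter trailing group:

private int getRgCount(StripeInformation stripe, int rowIndexStride) {
    // Hypothetical reconstruction of the helper used in determineRgsToRead:
    // row groups per stripe = row count / index stride, rounded up so that a
    // partial trailing group is still counted.
    return (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
}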

Example 2 with StripeInformation

Use of org.apache.orc.StripeInformation in project hive by apache.

From class TestIncrementalObjectSizeEstimator, method testMetadata:

@Test
public void testMetadata() throws IOException {
    // Mostly tests that it doesn't crash.
    OrcStripeMetadata osm = OrcStripeMetadata.createDummy(0);
    HashMap<Class<?>, ObjectEstimator> map = IncrementalObjectSizeEstimator.createEstimators(osm);
    IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", map);
    ObjectEstimator root = map.get(OrcStripeMetadata.class);
    LOG.info("Estimated " + root.estimate(osm, map) + " for a dummy OSM");
    OrcBatchKey stripeKey = null;
    DummyMetadataReader mr = new DummyMetadataReader();
    mr.doStreamStep = false;
    mr.isEmpty = true;
    StripeInformation si = Mockito.mock(StripeInformation.class);
    Mockito.when(si.getNumberOfRows()).thenReturn(0L);
    osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
    LOG.info("Estimated " + root.estimate(osm, map) + " for an empty OSM");
    mr.doStreamStep = true;
    osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
    LOG.info("Estimated " + root.estimate(osm, map) + " for an empty OSM after serde");
    mr.isEmpty = false;
    stripeKey = new OrcBatchKey(0, 0, 0);
    osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
    LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM");
    osm.resetRowIndex();
    LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM w/o row index");
    mr.doStreamStep = true;
    osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
    LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM after serde");
    osm.resetRowIndex();
    LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM w/o row index after serde");
    OrcFileMetadata ofm = OrcFileMetadata.createDummy(0);
    map = IncrementalObjectSizeEstimator.createEstimators(ofm);
    IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", map);
    root = map.get(OrcFileMetadata.class);
    LOG.info("Estimated " + root.estimate(ofm, map) + " for a dummy OFM");
}
Also used: OrcFileMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcFileMetadata), ObjectEstimator (org.apache.hadoop.hive.llap.IncrementalObjectSizeEstimator.ObjectEstimator), OrcBatchKey (org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey), StripeInformation (org.apache.orc.StripeInformation), OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata), Test (org.junit.Test)
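The estimator API exercised by this test follows a pattern that works for any object graph. A minimal sketch, assuming obj is an arbitrary root object and that estimate returns an int byte count (both are assumptions, not shown on this page):

// Walk the object graph once to build per-class estimators, then reuse the
// map to estimate the retained size of the root (or any reachable object).
HashMap<Class<?>, ObjectEstimator> estimators = IncrementalObjectSizeEstimator.createEstimators(obj);
ObjectEstimator rootEstimator = estimators.get(obj.getClass());
int estimatedBytes = rootEstimator.estimate(obj, estimators); // assumed int return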

Example 3 with StripeInformation

Use of org.apache.orc.StripeInformation in project hive by apache.

From class OrcFileFormatProxy, method applySargToMetadata:

@Override
public SplitInfos applySargToMetadata(SearchArgument sarg, ByteBuffer fileMetadata) throws IOException {
    // TODO: ideally we should store shortened representation of only the necessary fields
    // in HBase; it will probably require custom SARG application code.
    OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
    OrcProto.Footer footer = orcTail.getFooter();
    int stripeCount = footer.getStripesCount();
    boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(sarg, orcTail.getWriterVersion(), footer.getTypesList(), orcTail.getStripeStatistics(), stripeCount);
    // For ORC case, send the boundaries of the stripes so we don't have to send the footer.
    SplitInfos.Builder sb = SplitInfos.newBuilder();
    List<StripeInformation> stripes = orcTail.getStripes();
    boolean isEliminated = true;
    // A null result from pickStripesViaTranslatedSarg means nothing could be
    // eliminated, so every stripe is kept; guard the dereference accordingly.
    for (int i = 0; i < stripeCount; ++i) {
        if (result != null && !result[i])
            continue;
        isEliminated = false;
        StripeInformation si = stripes.get(i);
        if (LOG.isDebugEnabled()) {
            LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
        }
        sb.addInfos(SplitInfo.newBuilder().setIndex(i).setOffset(si.getOffset()).setLength(si.getLength()));
    }
    return isEliminated ? null : sb.build();
}
Also used: OrcProto (org.apache.orc.OrcProto), SplitInfos (org.apache.hadoop.hive.metastore.Metastore.SplitInfos), StripeInformation (org.apache.orc.StripeInformation), OrcTail (org.apache.orc.impl.OrcTail)
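A hypothetical consumer of the returned SplitInfos. The getInfosList, getIndex, getOffset, and getLength accessors are what protobuf generates for the fields populated above; proxy, sarg, serializedFooter, and makeSplit are illustrative names, not part of the API shown here:

SplitInfos infos = proxy.applySargToMetadata(sarg, serializedFooter);
if (infos == null) {
    // null means the SARG eliminated every stripe; nothing to read for this file.
} else {
    for (SplitInfo si : infos.getInfosList()) {
        // Each entry identifies one surviving stripe by index, offset, and length.
        makeSplit(si.getIndex(), si.getOffset(), si.getLength()); // hypothetical helper
    }
}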

Example 4 with StripeInformation

Use of org.apache.orc.StripeInformation in project flink by apache.

From class OrcShimV200, method getOffsetAndLengthForSplit:

@VisibleForTesting
public static Tuple2<Long, Long> getOffsetAndLengthForSplit(long splitStart, long splitLength, List<StripeInformation> stripes) {
    long splitEnd = splitStart + splitLength;
    long readStart = Long.MAX_VALUE;
    long readEnd = Long.MIN_VALUE;
    for (StripeInformation s : stripes) {
        if (splitStart <= s.getOffset() && s.getOffset() < splitEnd) {
            // stripe starts in split, so it is included
            readStart = Math.min(readStart, s.getOffset());
            readEnd = Math.max(readEnd, s.getOffset() + s.getLength());
        }
    }
    if (readStart < Long.MAX_VALUE) {
        // at least one stripe is included
        return Tuple2.of(readStart, readEnd - readStart);
    } else {
        return Tuple2.of(0L, 0L);
    }
}
Also used: StripeInformation (org.apache.orc.StripeInformation), VisibleForTesting (org.apache.flink.annotation.VisibleForTesting)
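A quick sanity check of the boundary logic, reusing the Mockito mocking style from Example 2; the offsets and lengths are invented for illustration, and Arrays.asList is plain java.util:

// Two stripes covering [100, 300) and [300, 500). A split of [0, 250) contains
// only the start of the first stripe, so exactly that stripe must be read.
StripeInformation s1 = Mockito.mock(StripeInformation.class);
Mockito.when(s1.getOffset()).thenReturn(100L);
Mockito.when(s1.getLength()).thenReturn(200L);
StripeInformation s2 = Mockito.mock(StripeInformation.class);
Mockito.when(s2.getOffset()).thenReturn(300L);
Mockito.when(s2.getLength()).thenReturn(200L);
Tuple2<Long, Long> range = OrcShimV200.getOffsetAndLengthForSplit(0L, 250L, Arrays.asList(s1, s2));
// range.f0 == 100L and range.f1 == 200L: read the first stripe in full.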

Example 5 with StripeInformation

Use of org.apache.orc.StripeInformation in project hive by apache.

From class OrcEncodedDataReader, method getFileFooterFromCacheOrDisk:

/**
 * Gets file metadata for the split from cache, or reads it from the file.
 */
private OrcFileMetadata getFileFooterFromCacheOrDisk() throws IOException {
    LlapBufferOrBuffers tailBuffers = null;
    List<StripeStatistics> stats = null;
    List<StripeInformation> stripes = null;
    boolean hasCache = fileKey != null && metadataCache != null;
    if (hasCache) {
        tailBuffers = metadataCache.getFileMetadata(fileKey);
        if (tailBuffers != null) {
            try {
                OrcTail orcTail = getOrcTailFromLlapBuffers(tailBuffers);
                counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
                FileTail tail = orcTail.getFileTail();
                stats = getStripeStatsFromOrcTail(orcTail);
                stripes = new ArrayList<>(tail.getFooter().getStripesCount());
                int stripeIdx = 0;
                for (OrcProto.StripeInformation stripeProto : tail.getFooter().getStripesList()) {
                    stripes.add(new ReaderImpl.StripeInformationImpl(stripeProto, stripeIdx++, -1, null));
                }
                return new OrcFileMetadata(fileKey, tail.getFooter(), tail.getPostscript(), stats, stripes, ReaderImpl.getFileVersion(tail.getPostscript().getVersionList()));
            } finally {
                // We don't need the buffer anymore.
                metadataCache.decRefBuffer(tailBuffers);
            }
        } else {
            counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
            throwIfCacheOnlyRead(isReadCacheOnly);
        }
    }
    ensureOrcReader();
    ByteBuffer tailBufferBb = orcReader.getSerializedFileFooter();
    if (hasCache) {
        tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, cacheTag, isStopped);
        // We don't use the cache's copy of the buffer.
        metadataCache.decRefBuffer(tailBuffers);
    }
    FileTail ft = orcReader.getFileTail();
    return new OrcFileMetadata(fileKey, ft.getFooter(), ft.getPostscript(), orcReader.getOrcProtoStripeStatistics(), orcReader.getStripes(), orcReader.getFileVersion());
}
Also used: OrcFileMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcFileMetadata), OrcProto (org.apache.orc.OrcProto), StripeStatistics (org.apache.orc.OrcProto.StripeStatistics), RecordReaderImpl (org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl), ReaderImpl (org.apache.orc.impl.ReaderImpl), ByteBuffer (java.nio.ByteBuffer), FileTail (org.apache.orc.OrcProto.FileTail), LlapBufferOrBuffers (org.apache.hadoop.hive.llap.io.metadata.MetadataCache.LlapBufferOrBuffers), StripeInformation (org.apache.orc.StripeInformation), OrcTail (org.apache.orc.impl.OrcTail)
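The method above is a read-through cache with reference counting. Condensed, the pattern looks like the sketch below, where decode, readFooterFromDisk, and decodeRaw are illustrative stand-ins for getOrcTailFromLlapBuffers and the ensureOrcReader/getSerializedFileFooter path:

LlapBufferOrBuffers buffers = metadataCache.getFileMetadata(fileKey);
if (buffers != null) {
    try {
        return decode(buffers); // cache hit: decode while still holding the ref
    } finally {
        metadataCache.decRefBuffer(buffers); // always release the pin taken by get
    }
}
ByteBuffer raw = readFooterFromDisk(); // cache miss: fall back to the file
// Publish for later readers, then drop the ref to the cached copy immediately,
// since decoding proceeds from our own buffer.
metadataCache.decRefBuffer(metadataCache.putFileMetadata(fileKey, raw, cacheTag, isStopped));
return decodeRaw(raw);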

Aggregations

Classes most often used together with StripeInformation across these projects (usage counts):

StripeInformation (org.apache.orc.StripeInformation): 30
Test (org.junit.Test): 10
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 9
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 8
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 8
Path (org.apache.hadoop.fs.Path): 7
OrcProto (org.apache.orc.OrcProto): 7
ArrayList (java.util.ArrayList): 6
Random (java.util.Random): 6
OrcStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata): 5
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 5
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 5
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 5
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 5
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 5
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 5
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 5
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 5
ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector): 5