Search in sources :

Example 1 with SerDeStripeMetadata

Use of org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata in the Apache Hive project.

From the class SerDeEncodedDataReader, method processOneSlice (the overload that takes Vectors):

/**
 * Builds the stripe metadata and an encoded column batch for one slice whose disk-side data
 * is supplied as vectors, then passes the batch to the consumer.
 * Unlike the other overload of processOneSlice, doesn't cache data.
 *
 * @param diskData      vectors read for this slice; must not be null
 * @param splitIncludes per-column flags; a flagged column (other than column 0) takes its
 *                      vectors from {@code diskData}, an unflagged one is filled from the cache data
 * @param stripeIx      stripe index recorded in the derived stripe metadata
 * @param cacheData     cached data for this slice; may be null
 * @param startTime     passed to {@code recordReaderTime} when processing is stopped
 * @return {@code true} if there was nothing to process, {@code false} if {@code processStop()}
 *         reported a stop, otherwise the result of {@code sendEcbToConsumer}
 * @throws IOException declared by this method's signature
 * @throws AssertionError if {@code diskData} is null
 */
private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException {
    if (diskData == null) {
        // The other overload should have been used.
        throw new AssertionError();
    }
    logProcessOneSlice(stripeIx, diskData, cacheData);

    final boolean hasCacheData = cacheData != null;
    if (!hasCacheData && diskData.getRowCount() == 0) {
        // Nothing to process.
        return true;
    }

    ColumnEncoding[] cacheEncodings = null;
    LlapDataBuffer[][][] cacheBuffers = null;
    if (hasCacheData) {
        cacheEncodings = cacheData.getEncodings();
        cacheBuffers = cacheData.getData();
        // Don't validate column count - no encodings for vectors.
        validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
    }

    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    // Without cached encodings, fall back to an array of nulls sized by the split includes.
    ColumnEncoding[] encodings = cacheEncodings;
    if (encodings == null) {
        encodings = new ColumnEncoding[splitIncludes.length];
    }
    metadata.setEncodings(Arrays.asList(encodings));
    metadata.setRowCount(diskData.getRowCount());
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);

    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);

    // Vectors in diskData are consumed sequentially, one per disk-read column after column 0.
    int nextVectorIx = 0;
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        if (!writerIncludes[colIx]) {
            continue;
        }
        if (!splitIncludes[colIx]) {
            // Not part of the split read; populate the column from the cache buffers.
            ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
            processColumnCacheData(cacheBuffers, ecb, colIx);
            continue;
        }
        if (colIx == 0) {
            // Skip the 0-th column, since it won't have a vector after reading the text source.
            ecb.initColumn(0, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
            continue;
        }
        List<ColumnVector> vectors = diskData.getVectors(nextVectorIx++);
        if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
        }
        ecb.initColumnWithVectors(colIx, vectors);
    }

    if (!processStop()) {
        return sendEcbToConsumer(ecb, hasCacheData, null);
    }
    recordReaderTime(startTime);
    return false;
}
Also used : ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) OrcEncodedColumnBatch(org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch) SerDeStripeMetadata(org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata) LlapDataBuffer(org.apache.hadoop.hive.llap.cache.LlapDataBuffer) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector)

Example 2 with SerDeStripeMetadata

Use of org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata in the Apache Hive project.

From the class SerDeEncodedDataReader, method processOneSlice (the overload that takes CacheWriter.CacheStripeData):

/**
 * Builds the stripe metadata and an encoded column batch for one slice from disk stream data
 * and/or cache data, caches the newly read slice, and passes the batch to the consumer.
 *
 * @param diskData      stream data read from disk; null means everything comes from the cache
 * @param splitIncludes per-column flags; when {@code diskData} is present, a flagged column
 *                      is populated from the disk streams, any other column from the cache data
 * @param stripeIx      stripe index recorded in the derived stripe metadata
 * @param cacheData     cached data for this slice; may be null
 * @param startTime     passed to {@code recordReaderTime} when processing is stopped
 * @return {@code false} if {@code processStop()} reported a stop, otherwise the result of
 *         {@code sendEcbToConsumer}
 * @throws IOException declared by this method's signature
 */
private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException {
    logProcessOneSlice(stripeIx, diskData, cacheData);

    ColumnEncoding[] cacheEncodings = null;
    LlapDataBuffer[][][] cacheBuffers = null;
    long cacheRowCount = -1L;
    if (cacheData != null) {
        cacheEncodings = cacheData.getEncodings();
        cacheBuffers = cacheData.getData();
        cacheRowCount = cacheData.getRowCount();
    }

    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    StripeData sliceToCache = null;
    final boolean readFromDisk = diskData != null;
    if (readFromDisk) {
        sliceToCache = createSliceToCache(diskData, cacheData);
        metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
        metadata.setRowCount(diskData.rowCount);
    } else {
        // Everything comes from the cache, so its encodings and row count are used as-is.
        metadata.setEncodings(Lists.newArrayList(cacheEncodings));
        metadata.setRowCount(cacheRowCount);
    }
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);

    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        if (!writerIncludes[colIx]) {
            continue;
        }
        ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        if (!readFromDisk || !splitIncludes[colIx]) {
            // Not read from disk; populate the column from the cache buffers.
            processColumnCacheData(cacheBuffers, ecb, colIx);
            continue;
        }

        // The column has been read from disk.
        List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
        // Called before the null check below, exactly as in the original ordering.
        LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
        if (streams == null) {
            // Struct column, such as root?
            continue;
        }
        // An explicit iterator is needed because suppressed streams are removed while iterating.
        for (Iterator<CacheWriter.CacheStreamData> iter = streams.iterator(); iter.hasNext();) {
            CacheWriter.CacheStreamData stream = iter.next();
            if (!stream.isSuppressed) {
                int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
                ColumnStreamData cb = CSD_POOL.take();
                cb.incRef();
                cb.setCacheBuffers(stream.data);
                ecb.setStreamData(colIx, streamIx, cb);
                continue;
            }
            if (LlapIoImpl.LOG.isTraceEnabled()) {
                LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
            }
            iter.remove();
            discardUncachedBuffers(stream.data);
        }
    }

    if (processStop()) {
        recordReaderTime(startTime);
        return false;
    }
    // Note: for now, rely on the cache put to lock the buffers before we send them over.
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
    }
    cacheFileData(sliceToCache);
    return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
Also used : StripeData(org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData) SerDeStripeMetadata(org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata) LlapDataBuffer(org.apache.hadoop.hive.llap.cache.LlapDataBuffer) ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) OrcEncodedColumnBatch(org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch) CacheWriter(org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter) ColumnStreamData(org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData)

Aggregations

LlapDataBuffer (org.apache.hadoop.hive.llap.cache.LlapDataBuffer): 2; SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata): 2; OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch): 2; ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding): 2; ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData): 1; StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData): 1; CacheWriter (org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter): 1; ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 1