Search in sources :

Example 1 with ColumnEncoding

use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.

the class SerDeEncodedDataReader method combineCacheAndWriterEncodings.

/**
 * Merges the per-column encodings known from the cache with the ones supplied by the writer.
 *
 * <p>When there are no cache encodings, a new list copy of the writer encodings is returned.
 * Otherwise both inputs must describe the same number of columns. For every column where the
 * writer supplies a non-null encoding, that encoding must equal the cached one (if the cached
 * one is non-null) and is stored in the result; columns without a writer encoding keep the
 * cached value.
 *
 * @param cacheEncodings encodings taken from the cache, indexed by column; may be null
 * @param writerEncodings encodings taken from the writer, indexed by column; entries may be null
 * @return a new list with the merged encodings
 * @throws IOException if the two inputs differ in length, or a column has two different
 *         non-null encodings
 */
private static List<ColumnEncoding> combineCacheAndWriterEncodings(ColumnEncoding[] cacheEncodings, List<ColumnEncoding> writerEncodings) throws IOException {
    // TODO: refactor with cache impl? it has the same merge logic
    if (cacheEncodings == null) {
        return new ArrayList<>(writerEncodings);
    }
    final int columnCount = cacheEncodings.length;
    if (columnCount != writerEncodings.size()) {
        throw new IOException("Incompatible encoding lengths: " + Arrays.toString(cacheEncodings) + " vs " + writerEncodings);
    }
    // Start from the cached view and overlay whatever the writer knows.
    ColumnEncoding[] merged = Arrays.copyOf(cacheEncodings, columnCount);
    for (int i = 0; i < columnCount; ++i) {
        ColumnEncoding fromWriter = writerEncodings.get(i);
        if (fromWriter != null) {
            ColumnEncoding fromCache = merged[i];
            boolean conflicts = fromCache != null && !fromWriter.equals(fromCache);
            if (conflicts) {
                throw new IOException("Incompatible encodings at " + i + ": " + Arrays.toString(cacheEncodings) + " vs " + writerEncodings);
            }
            merged[i] = fromWriter;
        }
    }
    return Lists.newArrayList(merged);
}
Also used : ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) ArrayList(java.util.ArrayList) IOException(java.io.IOException)

Example 2 with ColumnEncoding

use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.

the class SerDeEncodedDataReader method processOneSlice.

/**
 * Processes one slice whose disk data arrives as vectors. Unlike the other overload of
 * processOneSlice, this one doesn't cache data.
 *
 * <p>Stripe metadata is derived from the cached encodings (or an all-null encoding array sized
 * by {@code splitIncludes} when there is no cache data) and handed to the consumer. Then an
 * encoded column batch is filled: columns included in the split get their vectors from
 * {@code diskData} (column 0 gets no vector), the remaining included columns are filled from
 * the cache buffers.
 *
 * @param diskData vectors read from disk; must not be null
 * @param splitIncludes per-column flags, indexed like {@code writerIncludes}
 * @param stripeIx index of the stripe this slice belongs to
 * @param cacheData cached data for the slice; may be null
 * @param startTime start time passed to {@code recordReaderTime} if processing is stopped
 * @return true right away if there is no cache data and no disk rows; false if
 *         {@code processStop()} reported a stop; otherwise the result of sendEcbToConsumer
 * @throws IOException declared for the helpers this method calls
 */
private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException {
    if (diskData == null) {
        // The other overload should have been used.
        throw new AssertionError();
    }
    // LlapIoImpl.LOG.debug("diskData " + diskData);
    logProcessOneSlice(stripeIx, diskData, cacheData);
    final boolean hasCache = cacheData != null;
    if (!hasCache && diskData.getRowCount() == 0) {
        // Nothing to process.
        return true;
    }
    ColumnEncoding[] cachedEncodings = null;
    LlapDataBuffer[][][] cachedBuffers = null;
    if (hasCache) {
        cachedEncodings = cacheData.getEncodings();
        cachedBuffers = cacheData.getData();
        // Don't validate column count - no encodings for vectors.
        validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
    }
    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    ColumnEncoding[] metadataEncodings = cachedEncodings;
    if (metadataEncodings == null) {
        // No cached encodings; use an array of nulls with one slot per split column.
        metadataEncodings = new ColumnEncoding[splitIncludes.length];
    }
    metadata.setEncodings(Arrays.asList(metadataEncodings));
    metadata.setRowCount(diskData.getRowCount());
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);
    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
    // Vectors are stored densely, so they are consumed with their own running index.
    int nextVectorIx = 0;
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        if (!writerIncludes[colIx]) {
            continue;
        }
        if (!splitIncludes[colIx]) {
            // Not part of the split; take the column from the cache buffers.
            ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
            processColumnCacheData(cachedBuffers, ecb, colIx);
        } else if (colIx == 0) {
            // Skip the 0-th column, since it won't have a vector after reading the text source.
            ecb.initColumn(0, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        } else {
            List<ColumnVector> vectors = diskData.getVectors(nextVectorIx++);
            if (LlapIoImpl.LOG.isTraceEnabled()) {
                LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
            }
            ecb.initColumnWithVectors(colIx, vectors);
        }
    }
    if (processStop()) {
        recordReaderTime(startTime);
        return false;
    }
    return sendEcbToConsumer(ecb, hasCache, null);
}
Also used : ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) OrcEncodedColumnBatch(org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch) SerDeStripeMetadata(org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata) LlapDataBuffer(org.apache.hadoop.hive.llap.cache.LlapDataBuffer) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector)

Example 3 with ColumnEncoding

use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.

the class SerDeEncodedDataReader method createSliceToCache.

/**
 * Builds the {@link StripeData} slice that should be put into the cache for data read from disk.
 *
 * <p>Without existing cache data, the slice is built directly from the disk data's offsets, row
 * count and encodings. With existing cache data, the two are validated against each other, the
 * structure of the cached slice is duplicated, the disk encodings are written over the
 * corresponding slots, and the smaller of the two known-torn-start values is kept.
 *
 * @param diskData stripe data produced by the cache writer; asserted to be non-null
 * @param cacheData the slice already present in the cache; may be null
 * @return the slice to cache
 * @throws IOException declared for {@code validateCacheAndDisk}
 */
private StripeData createSliceToCache(CacheWriter.CacheStripeData diskData, StripeData cacheData) throws IOException {
    assert diskData != null;
    if (cacheData == null) {
        return new StripeData(diskData.knownTornStart, diskData.firstRowStart, diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount, diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
    }
    long rowCount = diskData.rowCount;
    long encodingCount = diskData.encodings.size();
    validateCacheAndDisk(cacheData, rowCount, encodingCount, diskData);
    if (LlapIoImpl.LOG.isDebugEnabled()) {
        LlapIoImpl.LOG.debug("Creating slice to cache in addition to an existing slice " + cacheData.toCoordinateString() + "; disk offsets were " + diskData.toCoordinateString());
    }
    // Note: we could just do what we already do above from disk data, except for the validation
    // that is not strictly necessary, and knownTornStart which is an optimization.
    StripeData sliceToCache = StripeData.duplicateStructure(cacheData);
    final int diskEncodingCount = diskData.encodings.size();
    for (int colIx = 0; colIx < diskEncodingCount; ++colIx) {
        sliceToCache.getEncodings()[colIx] = diskData.encodings.get(colIx);
    }
    sliceToCache.setKnownTornStart(Math.min(diskData.knownTornStart, sliceToCache.getKnownTornStart()));
    return sliceToCache;
}
Also used : ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) StripeData(org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData)

Example 4 with ColumnEncoding

use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.

the class SerDeEncodedDataReader method processOneSlice.

/**
 * Processes one slice using data read from disk by the cache writer and/or data already in the
 * cache, caches the newly read data, and sends the resulting encoded column batch to the consumer.
 *
 * <p>Steps, in order: derive stripe metadata (merged cache + disk encodings when disk data is
 * present, cache encodings alone otherwise) and pass it to the consumer; fill an
 * {@code OrcEncodedColumnBatch} column by column, taking streams from {@code diskData} for
 * columns included in the split and from the cache buffers for the rest; then, unless
 * {@code processStop()} reports a stop, cache the new slice and send the batch.
 *
 * @param diskData stripe data read from disk; null means no disk data was supplied
 * @param splitIncludes per-column flags, indexed like {@code writerIncludes}
 * @param stripeIx index of the stripe this slice belongs to
 * @param cacheData cached data for the slice; may be null
 * @param startTime start time passed to {@code recordReaderTime} if processing is stopped
 * @return false if {@code processStop()} reported a stop; otherwise the result of
 *         {@code sendEcbToConsumer}
 * @throws IOException if the cache and disk data are incompatible (see
 *         {@code createSliceToCache} and {@code combineCacheAndWriterEncodings})
 */
private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException {
    logProcessOneSlice(stripeIx, diskData, cacheData);
    ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
    LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
    // -1 stands in for the row count when there is no cache data.
    long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    StripeData sliceToCache = null;
    // No disk data was supplied, so everything has to come from cacheData.
    boolean hasAllData = diskData == null;
    if (!hasAllData) {
        sliceToCache = createSliceToCache(diskData, cacheData);
        metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
        metadata.setRowCount(diskData.rowCount);
    } else {
        // NOTE(review): cacheEncodings is null here if cacheData is also null; presumably
        // callers never pass null for both diskData and cacheData - confirm.
        metadata.setEncodings(Lists.newArrayList(cacheEncodings));
        metadata.setRowCount(cacheRowCount);
    }
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);
    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        // Columns that are not included are left uninitialized in the batch.
        if (!writerIncludes[colIx])
            continue;
        ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        if (!hasAllData && splitIncludes[colIx]) {
            // The column has been read from disk.
            List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
            // Called before the null check below, so it also runs for columns without streams.
            LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
            // Struct column, such as root?
            if (streams == null)
                continue;
            // An explicit iterator is used so suppressed streams can be removed while iterating.
            Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
            while (iter.hasNext()) {
                CacheWriter.CacheStreamData stream = iter.next();
                if (stream.isSuppressed) {
                    if (LlapIoImpl.LOG.isTraceEnabled()) {
                        LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
                    }
                    // Drop the stream from the disk data and discard its buffers; it is neither
                    // cached nor added to the batch.
                    iter.remove();
                    discardUncachedBuffers(stream.data);
                    continue;
                }
                // Record the stream in the to-be-cached array and attach the same buffers to the
                // batch under the returned stream index.
                int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
                ColumnStreamData cb = CSD_POOL.take();
                cb.incRef();
                cb.setCacheBuffers(stream.data);
                ecb.setStreamData(colIx, streamIx, cb);
            }
        } else {
            // Not read from disk; take the column from the cache buffers.
            processColumnCacheData(cacheBuffers, ecb, colIx);
        }
    }
    if (processStop()) {
        recordReaderTime(startTime);
        return false;
    }
    // Each slice is cached separately here (see cacheFileData); for now we just rely on the
    // cache put to lock the buffers before we send them over.
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
    }
    cacheFileData(sliceToCache);
    return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
Also used : StripeData(org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData) SerDeStripeMetadata(org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata) LlapDataBuffer(org.apache.hadoop.hive.llap.cache.LlapDataBuffer) ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) OrcEncodedColumnBatch(org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch) CacheWriter(org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter) ColumnStreamData(org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData)

Example 5 with ColumnEncoding

use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.

the class SerDeEncodedDataReader method cacheFileData.

/**
 * Puts one slice of stripe data into the cache, or locks its buffers when there is no file key.
 *
 * <p>Does nothing when {@code sd} is null or has no encodings. Before caching, encodings of
 * columns that carry no data are cleared so the stored encodings match the stored data.
 *
 * @param sd the slice to cache; may be null
 * @throws AssertionError if a column has data but no encoding
 */
public void cacheFileData(StripeData sd) {
    // We assume that if put/lock throws in the middle, it's ok to treat buffers as not being
    // locked and to blindly deallocate them, since they are not going to be used. Therefore
    // we don't remove them from the cleanup list - we will do it after sending to consumer.
    // This relies on sequence of calls to cacheFileData and sendEcb..
    if (sd == null) {
        return;
    }
    if (sd.getEncodings() == null) {
        return;
    }
    if (fileKey == null) {
        lockAllBuffers(sd);
        return;
    }
    // Note that we cache each slice separately. We could cache them together at the end, but
    // then we won't be able to pass them to users without inc-refing explicitly.
    ColumnEncoding[] encodings = sd.getEncodings();
    for (int colIx = 0; colIx < encodings.length; ++colIx) {
        // Make data consistent with encodings, don't store useless information.
        boolean hasData = sd.getData()[colIx] != null;
        if (!hasData) {
            encodings[colIx] = null;
        } else if (encodings[colIx] == null) {
            throw new AssertionError("Caching data without an encoding at " + colIx + ": " + sd);
        }
    }
    FileData fd = new FileData(fileKey, encodings.length);
    fd.addStripe(sd);
    cache.putFileData(fd, Priority.NORMAL, counters);
}
Also used : ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) FileData(org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.FileData)

Aggregations

ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding): 6
StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData): 3
LlapDataBuffer (org.apache.hadoop.hive.llap.cache.LlapDataBuffer): 2
SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata): 2
CacheWriter (org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter): 2
OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch): 2
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData): 1
FileData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.FileData): 1
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 1