
Example 1 with StripeData

Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.

In the class SerDeEncodedDataReader, the method performDataRead:

protected Void performDataRead() throws IOException, InterruptedException {
    boolean isOk = false;
    try {
        try {
            long startTime = counters.startTimeCounter();
            LlapIoImpl.LOG.info("Processing data for {}", split.getPath());
            if (processStop()) {
                recordReaderTime(startTime);
                return null;
            }
            Boolean isFromCache = null;
            try {
                isFromCache = readFileWithCache(startTime);
            } finally {
                // Buffers read from cache are unlocked as the data is consumed,
                // so we expect to have stuff remaining in there only in case of errors.
                if (cachedData != null && cachedData.getData() != null) {
                    for (StripeData sd : cachedData.getData()) {
                        unlockAllBuffers(sd);
                    }
                    cachedData = null;
                }
            }
            // Stop requested, and handled inside.
            if (isFromCache == null)
                return null;
            if (!isFromCache) {
                if (!processOneFileSplit(split, startTime, Ref.from(0), null))
                    return null;
            }
            // Done with all the things.
            recordReaderTime(startTime);
            if (LlapIoImpl.LOG.isTraceEnabled()) {
                LlapIoImpl.LOG.trace("done processing {}", split);
            }
        } catch (Throwable e) {
            LlapIoImpl.LOG.error("Exception while processing", e);
            consumer.setError(e);
            throw e;
        }
        consumer.setDone();
        isOk = true;
        return null;
    } finally {
        cleanup(!isOk);
        // Do not clean up the writers - the callback should do it.
    }
}
Also used: StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData), AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)
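
The finally block above relies on unlockAllBuffers to release any cached slices still held after an error. A minimal sketch of what such a method could look like, assuming a BufferUsageManager field named bufferManager (hypothetical; the real method in SerDeEncodedDataReader may differ). StripeData.getData() is indexed as [column][stream][buffer]:

private void unlockAllBuffers(StripeData sd) {
    for (LlapSerDeDataBuffer[][] column : sd.getData()) {
        // Columns that were not read for this slice have no buffers to release.
        if (column == null) continue;
        for (LlapSerDeDataBuffer[] stream : column) {
            if (stream == null) continue;
            for (LlapSerDeDataBuffer buffer : stream) {
                // Drop this reader's reference; the cache may still hold its own.
                bufferManager.decRefBuffer(buffer);
            }
        }
    }
}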

Example 2 with StripeData

Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.

In the class SerDeEncodedDataReader, the method readFileWithCache:

public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
    if (fileKey == null)
        return false;
    BooleanRef gotAllData = new BooleanRef();
    long endOfSplit = split.getStart() + split.getLength();
    this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit, writerIncludes, CC_FACTORY, counters, gotAllData);
    if (!gotAllData.value) {
        throwIfCacheOnlyRead(isReadCacheOnly);
    }
    if (cachedData == null) {
        if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
            LlapIoImpl.CACHE_LOGGER.trace("No data for the split found in cache");
        }
        return false;
    }
    String[] hosts = extractHosts(split, false), inMemoryHosts = extractHosts(split, true);
    List<StripeData> slices = cachedData.getData();
    if (slices.isEmpty())
        return false;
    long uncachedPrefixEnd = slices.get(0).getKnownTornStart();
    long uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
    long lastStripeLastStart = slices.get(slices.size() - 1).getLastStart();
    Ref<Integer> stripeIx = Ref.from(0);
    if (uncachedPrefixEnd > split.getStart()) {
        // TODO: can we merge neighboring splits? So we don't init so many readers.
        FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(), uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null))
            return null;
    }
    while (!slices.isEmpty()) {
        // Remove (not just get) the slice so the loop terminates.
        StripeData slice = slices.remove(0);
        long start = slice.getKnownTornStart();
        // Will also read the last row.
        long len = slice.getLastStart() - start;
        FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice))
            return null;
    }
    boolean isUnfortunate = false;
    if (uncachedSuffixStart == endOfSplit) {
        // This is rather obscure. The end of last row cached is precisely at the split end offset.
        // If the split is in the middle of the file, LRR would read one more row after that,
        // therefore as unfortunate as it is, we have to do a one-row read. However, for that to
        // have happened, someone should have supplied a split that ends inside the last row, i.e.
        // a few bytes earlier than the current split, which is pretty unlikely. What is more likely
        // is that the split, and the last row, both end at the end of file. Check for this.
        long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
        isUnfortunate = size > endOfSplit;
        if (isUnfortunate) {
            // Log at warn, given how unfortunate this is.
            LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath() + " at " + endOfSplit + "; file size is " + size);
        }
    }
    if (uncachedSuffixStart < endOfSplit || isUnfortunate) {
        // Note: we assume a 0-length split is correct given how LRR interprets offsets (reading an
        // extra row). Should we instead assume 1+ chars and add 1 for isUnfortunate?
        // Do not read from uncachedSuffixStart as LineRecordReader skips first row
        FileSplit splitPart = new FileSplit(split.getPath(), lastStripeLastStart, endOfSplit - lastStripeLastStart, hosts, inMemoryHosts);
        if (!processOneFileSplit(splitPart, startTime, stripeIx, null))
            return null;
    }
    return true;
}
Also used: StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData), BooleanRef (org.apache.hadoop.hive.common.io.DataCache.BooleanRef), FileSplit (org.apache.hadoop.mapred.FileSplit)
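
The offset arithmetic above is easier to follow with concrete numbers. The snippet below is illustrative only; all values are made up, and the real offsets come from the cached StripeData slices:

public class SplitGeometryDemo {
    public static void main(String[] args) {
        long splitStart = 1000, splitEnd = 5000; // The split covers [1000, 5000).
        long firstTornStart = 1200; // getKnownTornStart() of the first cached slice.
        long lastStart = 4600; // getLastStart() of the last cached slice.
        long lastEnd = 4800; // getLastEnd() of the last cached slice.
        if (firstTornStart > splitStart) {
            // Uncached prefix, read from disk before any cached slice is used.
            System.out.println("prefix: [" + splitStart + ", " + firstTornStart + ")");
        }
        // Each cached slice is then processed as [knownTornStart, lastStart); the record
        // reader still returns the last row because it starts just before lastStart.
        if (lastEnd < splitEnd) {
            // The uncached suffix starts at lastStart, not lastEnd, because
            // LineRecordReader skips the first (partial) row of a non-initial split.
            System.out.println("suffix: [" + lastStart + ", " + splitEnd + ")");
        }
    }
}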

Example 3 with StripeData

Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.

In the class SerDeEncodedDataReader, the method processAsyncCacheData:

private void processAsyncCacheData(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes) throws IOException {
    StripeData sliceToCache = new StripeData(diskData.knownTornStart, diskData.firstRowStart, diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount, diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
    for (int colIx = 0; colIx < splitIncludes.length; ++colIx) {
        if (!splitIncludes[colIx])
            continue;
        // The column has been read from disk.
        List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
        LlapSerDeDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
        // Struct column, such as root?
        if (streams == null)
            continue;
        Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
        while (iter.hasNext()) {
            CacheWriter.CacheStreamData stream = iter.next();
            if (stream.isSuppressed) {
                if (LlapIoImpl.LOG.isTraceEnabled()) {
                    LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
                }
                iter.remove();
                discardUncachedBuffers(stream.data);
                continue;
            }
            setStreamDataToCache(newCacheDataForCol, stream);
        }
    }
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("Data to cache from async read " + sliceToCache);
    }
    try {
        cacheFileData(sliceToCache);
    } finally {
        unlockAllBuffers(sliceToCache);
    }
}
Also used: ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding), StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData)
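
When a suppressed stream is dropped above, its buffers were never published to the cache, so they can go straight back to the allocator. A hedged sketch of what discardUncachedBuffers might do, assuming an Allocator field named allocator (hypothetical; the real method may differ):

private void discardUncachedBuffers(List<MemoryBuffer> data) {
    for (MemoryBuffer buffer : data) {
        // Nothing else references these buffers, so deallocate them immediately.
        allocator.deallocate(buffer);
    }
}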

Example 4 with StripeData

Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.

In the class SerDeEncodedDataReader, the method processOneSlice:

private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException, InterruptedException {
    logProcessOneSlice(stripeIx, diskData, cacheData);
    ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
    LlapSerDeDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
    long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    StripeData sliceToCache = null;
    boolean hasAllData = diskData == null;
    if (!hasAllData) {
        sliceToCache = createSliceToCache(diskData, cacheData);
        metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
        metadata.setRowCount(diskData.rowCount);
    } else {
        metadata.setEncodings(Lists.newArrayList(cacheEncodings));
        metadata.setRowCount(cacheRowCount);
    }
    if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);
    OrcEncodedColumnBatch ecb = useObjectPools ? ECB_POOL.take() : new OrcEncodedColumnBatch();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
    // Skip the 0th column that is the root structure.
    for (int colIx = 1; colIx < writerIncludes.length; ++colIx) {
        if (!writerIncludes[colIx])
            continue;
        ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        if (!hasAllData && splitIncludes[colIx]) {
            // The column has been read from disk.
            List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
            LlapSerDeDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
            // Struct column, such as root?
            if (streams == null)
                continue;
            Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
            while (iter.hasNext()) {
                CacheWriter.CacheStreamData stream = iter.next();
                if (stream.isSuppressed) {
                    if (LlapIoImpl.LOG.isTraceEnabled()) {
                        LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
                    }
                    iter.remove();
                    discardUncachedBuffers(stream.data);
                    continue;
                }
                int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
                ColumnStreamData cb = useObjectPools ? CSD_POOL.take() : new ColumnStreamData();
                cb.incRef();
                cb.setCacheBuffers(stream.data);
                ecb.setStreamData(colIx, streamIx, cb);
            }
        } else {
            processColumnCacheData(cacheBuffers, ecb, colIx);
        }
    }
    if (processStop()) {
        recordReaderTime(startTime);
        return false;
    }
    // Ideally we would lock the newly created buffers for the consumer here,
    // but for now just rely on the cache put to lock them before we send them over.
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
    }
    cacheFileData(sliceToCache);
    return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
Also used: StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData), SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata), ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding), OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch), LlapSerDeDataBuffer (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.LlapSerDeDataBuffer), ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData)
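
The ECB_POOL.take() and CSD_POOL.take() calls above reuse recycled batch objects to reduce allocation pressure; consumers are expected to offer them back when done. A minimal, self-contained sketch of that take/offer pattern (this FixedSizePool is illustrative, not LLAP's actual pool class):

import java.util.ArrayDeque;
import java.util.function.Supplier;

final class FixedSizePool<T> {
    private final ArrayDeque<T> pool = new ArrayDeque<>();
    private final int maxSize;
    private final Supplier<T> factory;

    FixedSizePool(int maxSize, Supplier<T> factory) {
        this.maxSize = maxSize;
        this.factory = factory;
    }

    synchronized T take() {
        T recycled = pool.poll();
        return recycled != null ? recycled : factory.get(); // Allocate only when empty.
    }

    synchronized void offer(T item) {
        if (pool.size() < maxSize) pool.push(item); // Otherwise let it be collected.
    }
}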

Example 5 with StripeData

Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.

In the class SerDeEncodedDataReader, the method createSliceToCache:

private StripeData createSliceToCache(CacheWriter.CacheStripeData diskData, StripeData cacheData) throws IOException {
    assert diskData != null;
    if (cacheData == null) {
        return new StripeData(diskData.knownTornStart, diskData.firstRowStart, diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount, diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
    } else {
        long rowCount = diskData.rowCount, encodingCount = diskData.encodings.size();
        validateCacheAndDisk(cacheData, rowCount, encodingCount, diskData);
        if (LlapIoImpl.LOG.isDebugEnabled()) {
            LlapIoImpl.LOG.debug("Creating slice to cache in addition to an existing slice " + cacheData.toCoordinateString() + "; disk offsets were " + diskData.toCoordinateString());
        }
        // Note: we could just do what we already do above from disk data, except for the validation
        // that is not strictly necessary, and knownTornStart which is an optimization.
        StripeData sliceToCache = StripeData.duplicateStructure(cacheData);
        for (int i = 0; i < diskData.encodings.size(); ++i) {
            sliceToCache.getEncodings()[i] = diskData.encodings.get(i);
        }
        sliceToCache.setKnownTornStart(Math.min(diskData.knownTornStart, sliceToCache.getKnownTornStart()));
        return sliceToCache;
    }
}
Also used: ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding), StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData)
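
The validation step above compares the freshly read disk data against the slice already in cache. A hypothetical sketch of what validateCacheAndDisk might check, inferred only from its call site (the real method may check more or differ in details):

private void validateCacheAndDisk(StripeData cacheData, long rowCount, long encodingCount,
        CacheWriter.CacheStripeData diskData) throws IOException {
    if (rowCount != cacheData.getRowCount()) {
        throw new IOException("Row count mismatch: disk " + rowCount + " vs cached "
            + cacheData.getRowCount() + " for " + diskData.toCoordinateString());
    }
    if (encodingCount > cacheData.getEncodings().length) {
        throw new IOException("Encoding count mismatch: disk " + encodingCount
            + " vs cached " + cacheData.getEncodings().length);
    }
}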

Aggregations

StripeData (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData): 5 usages
ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding): 3 usages
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 1 usage
BooleanRef (org.apache.hadoop.hive.common.io.DataCache.BooleanRef): 1 usage
ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData): 1 usage
LlapSerDeDataBuffer (org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.LlapSerDeDataBuffer): 1 usage
SerDeStripeMetadata (org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata): 1 usage
OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch): 1 usage
FileSplit (org.apache.hadoop.mapred.FileSplit): 1 usage