
Example 1 with CacheWriter

Use of org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter in project hive by apache.

From the class SerDeEncodedDataReader, method processOneFileSplit:

public boolean processOneFileSplit(FileSplit split, long startTime, Ref<Integer> stripeIxRef, StripeData slice) throws IOException {
    LlapIoImpl.LOG.info("Processing one split {" + split.getPath() + ", " + split.getStart() + ", " + split.getLength() + "}");
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("Cache data for the split is " + slice);
    }
    boolean[] splitIncludes = Arrays.copyOf(writerIncludes, writerIncludes.length);
    boolean hasAllData = slice != null && determineSplitIncludes(slice, splitIncludes, writerIncludes);
    // We have 3 cases here:
    // 1) All the data is in the cache. Always a single slice, no disk read, no cache puts.
    // 2) Some data is in the cache. Always a single slice, disk read and a single cache put.
    // 3) No data is in the cache. Multiple slices, disk read and multiple cache puts.
    if (hasAllData) {
        // Everything comes from cache.
        CacheWriter.CacheStripeData csd = null;
        boolean result = processOneSlice(csd, splitIncludes, stripeIxRef.value, slice, startTime);
        ++stripeIxRef.value;
        return result;
    }
    boolean result = false;
    // This initializes currentFileRead.
    startReadSplitFromFile(split, splitIncludes, slice);
    try {
        if (slice != null) {
            // If we had a cache range already, we expect a single matching disk slice.
            Vectors vectors = currentFileRead.readNextSlice();
            if (!vectors.isSupported()) {
                // Not in VRB mode - the new cache data is ready, we should use it.
                CacheWriter cacheWriter = currentFileRead.getCacheWriter();
                assert cacheWriter.stripes.size() == 1;
                result = processOneSlice(cacheWriter.stripes.get(0), splitIncludes, stripeIxRef.value, slice, startTime);
            } else {
                // VRB mode - process the VRBs with cache data; the new cache data is coming later.
                result = processOneSlice(vectors, splitIncludes, stripeIxRef.value, slice, startTime);
            }
            assert null == currentFileRead.readNextSlice();
            ++stripeIxRef.value;
        } else {
            // All the data comes from disk. The reader may have split it into multiple slices.
            Vectors vectors = currentFileRead.readNextSlice();
            assert vectors != null;
            result = true;
            if (!vectors.isSupported()) {
                // Force the rest of the data through so the CacheWriter sees every slice.
                while (currentFileRead.readNextSlice() != null) {
                    // Intentionally empty: each call drives another slice into the CacheWriter.
                }
                CacheWriter cacheWriter = currentFileRead.getCacheWriter();
                for (CacheWriter.CacheStripeData csd : cacheWriter.stripes) {
                    if (!processOneSlice(csd, splitIncludes, stripeIxRef.value, null, startTime)) {
                        result = false;
                        break;
                    }
                    ++stripeIxRef.value;
                }
            } else {
                // VRB mode - process the VRBs; the new cache data is coming later.
                do {
                    assert vectors.isSupported();
                    if (!processOneSlice(vectors, splitIncludes, stripeIxRef.value, null, startTime)) {
                        result = false;
                        break;
                    }
                    ++stripeIxRef.value;
                } while ((vectors = currentFileRead.readNextSlice()) != null);
            }
        }
    } finally {
        cleanUpCurrentRead();
    }
    return result;
}
Also used : CacheWriter(org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter)
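
The method is invoked once per file split; the caller passes the same stripe-index Ref for every split of a file so that stripe numbering stays contiguous, and a null StripeData when nothing for the split is cached. A minimal driver sketch follows, assuming a hypothetical lookupCachedSlice helper, an assumed Ref.from construction, and System.nanoTime() for the start timestamp; none of this loop is taken from the Hive sources.

// Hypothetical driver (illustrative only) for processOneFileSplit.
void processAllSplits(SerDeEncodedDataReader reader, List<FileSplit> splits) throws IOException {
    Ref<Integer> stripeIx = Ref.from(0);   // assumed construction; one counter shared across the file
    long startTime = System.nanoTime();    // assumed source of the start timestamp
    for (FileSplit split : splits) {
        // lookupCachedSlice is a hypothetical helper; null means the cache holds nothing (case 3).
        StripeData cachedSlice = lookupCachedSlice(split);
        if (!reader.processOneFileSplit(split, startTime, stripeIx, cachedSlice)) {
            break;   // the consumer stopped reading; stop feeding further splits
        }
    }
}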

Example 2 with CacheWriter

Use of org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter in project hive by apache.

From the class SerDeEncodedDataReader, method startReadSplitFromFile:

public void startReadSplitFromFile(FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
    boolean maySplitTheSplit = slice == null;
    ReaderWithOffsets offsetReader = null;
    @SuppressWarnings("rawtypes") RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
    try {
        offsetReader = createOffsetReader(sourceReader);
        sourceReader = null;
    } finally {
        if (sourceReader != null) {
            try {
                sourceReader.close();
            } catch (Exception ex) {
                LlapIoImpl.LOG.error("Failed to close source reader", ex);
            }
        }
    }
    maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();
    try {
        StructObjectInspector originalOi = (StructObjectInspector) getOiFromSerDe();
        List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(schema, splitIncludes, false);
        // fileread writes to the writer, which writes to orcWriter, which writes to cacheWriter
        EncodingWriter writer = VectorDeserializeOrcWriter.create(sourceInputFormat, sourceSerDe, parts, daemonConf, jobConf, split.getPath(), originalOi, splitColumnIds, splitIncludes, allocSize);
        // TODO: move this into ctor? EW would need to create CacheWriter then
        List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
        writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes, writer.isOnlyWritingIncludedColumns()), daemonConf, split.getPath());
        if (writer instanceof VectorDeserializeOrcWriter) {
            VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter) writer;
            asyncWriter.startAsync(new AsyncCacheDataCallback());
            this.asyncWriters.add(asyncWriter);
        }
        currentFileRead = new FileReaderYieldReturn(offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
    } finally {
        // Assignment is the last thing in the try, so if it happened we assume success.
        if (currentFileRead != null)
            return;
        if (offsetReader == null)
            return;
        try {
            offsetReader.close();
        } catch (Exception ex) {
            LlapIoImpl.LOG.error("Failed to close source reader", ex);
        }
    }
}
Also used : RecordReader (org.apache.hadoop.mapred.RecordReader), LineRecordReader (org.apache.hadoop.mapred.LineRecordReader), IOException (java.io.IOException), SerDeException (org.apache.hadoop.hive.serde2.SerDeException), CacheWriter (org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
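
Both try/finally blocks in this method rely on the same hand-off idiom: a resource is passed to the next layer inside the try, a local reference is cleared (or a sentinel checked) to record that the hand-off succeeded, and the finally clause closes the resource only when the hand-off never happened. A generic sketch of the idiom, using plain java.io types rather than the Hive classes above, might look like this:

// Close-on-failed-hand-off idiom, sketched with standard java.io types (illustrative only).
static InputStream openBuffered(String path) throws IOException {
    FileInputStream raw = new FileInputStream(path);
    try {
        BufferedInputStream wrapped = new BufferedInputStream(raw);
        raw = null;   // hand-off succeeded; the wrapper now owns the underlying stream
        return wrapped;
    } finally {
        if (raw != null) {
            raw.close();   // wrapping never happened; avoid leaking the stream
        }
    }
}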

Aggregations

CacheWriter (org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter): 2
IOException (java.io.IOException): 1
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 1
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 1
LineRecordReader (org.apache.hadoop.mapred.LineRecordReader): 1
RecordReader (org.apache.hadoop.mapred.RecordReader): 1