Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.
The class SerDeEncodedDataReader, method performDataRead.
protected Void performDataRead() throws IOException, InterruptedException {
  boolean isOk = false;
  try {
    try {
      long startTime = counters.startTimeCounter();
      LlapIoImpl.LOG.info("Processing data for {}", split.getPath());
      if (processStop()) {
        recordReaderTime(startTime);
        return null;
      }
      Boolean isFromCache = null;
      try {
        isFromCache = readFileWithCache(startTime);
      } finally {
        // We expect to have data remaining here only in case of errors.
        if (cachedData != null && cachedData.getData() != null) {
          for (StripeData sd : cachedData.getData()) {
            unlockAllBuffers(sd);
          }
          cachedData = null;
        }
      }
      // Stop requested, and handled inside.
      if (isFromCache == null) return null;
      if (!isFromCache) {
        if (!processOneFileSplit(split, startTime, Ref.from(0), null)) return null;
      }
      // Done with all the things.
      recordReaderTime(startTime);
      if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("done processing {}", split);
      }
    } catch (Throwable e) {
      LlapIoImpl.LOG.error("Exception while processing", e);
      consumer.setError(e);
      throw e;
    }
    consumer.setDone();
    isOk = true;
    return null;
  } finally {
    cleanup(!isOk);
    // Do not clean up the writers - the callback should do it.
  }
}
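The control flow above hinges on a tri-state Boolean: readFileWithCache returns null when a stop was requested, false when the split has to be read from disk, and true when it was served through the cache path; errors are surfaced to the consumer and cleanup always runs in finally. Below is a minimal, self-contained sketch of that pattern using hypothetical stand-in types (DataReadSketch, Consumer, readWithCache, readFromDisk and cleanup are illustrations, not the Hive API).

import java.io.IOException;

// Hypothetical, simplified illustration of the performDataRead() control flow:
// tri-state cache read, error propagation to a consumer, and cleanup in finally.
public class DataReadSketch {
  interface Consumer {
    void setError(Throwable t);
    void setDone();
  }

  private final Consumer consumer;

  DataReadSketch(Consumer consumer) {
    this.consumer = consumer;
  }

  // Tri-state result: null = stop requested, false = not cached (read from disk), true = cached.
  private Boolean readWithCache() throws IOException {
    return Boolean.FALSE; // stand-in
  }

  private boolean readFromDisk() throws IOException {
    return true; // stand-in; false would mean a stop was requested mid-read
  }

  private void cleanup(boolean isError) {
    // stand-in: release buffers and close readers; writers are cleaned up by a callback
  }

  protected Void performRead() throws IOException {
    boolean isOk = false;
    try {
      try {
        Boolean isFromCache = readWithCache();
        if (isFromCache == null) return null;             // stop requested, handled inside
        if (!isFromCache && !readFromDisk()) return null;  // fall back to a disk read
      } catch (Throwable e) {
        consumer.setError(e);                              // surface the failure to the consumer
        throw e;
      }
      consumer.setDone();
      isOk = true;
      return null;
    } finally {
      cleanup(!isOk);
    }
  }

  public static void main(String[] args) throws IOException {
    Consumer printer = new Consumer() {
      public void setError(Throwable t) { System.out.println("error: " + t); }
      public void setDone() { System.out.println("done"); }
    };
    new DataReadSketch(printer).performRead(); // prints "done"
  }
}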
Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.
The class SerDeEncodedDataReader, method readFileWithCache.
public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
  if (fileKey == null) return false;
  BooleanRef gotAllData = new BooleanRef();
  long endOfSplit = split.getStart() + split.getLength();
  this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit,
      writerIncludes, CC_FACTORY, counters, gotAllData);
  if (!gotAllData.value) {
    throwIfCacheOnlyRead(isReadCacheOnly);
  }
  if (cachedData == null) {
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("No data for the split found in cache");
    }
    return false;
  }
  String[] hosts = extractHosts(split, false), inMemoryHosts = extractHosts(split, true);
  List<StripeData> slices = cachedData.getData();
  if (slices.isEmpty()) return false;
  long uncachedPrefixEnd = slices.get(0).getKnownTornStart(),
      uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd(),
      lastStripeLastStart = slices.get(slices.size() - 1).getLastStart();
  Ref<Integer> stripeIx = Ref.from(0);
  if (uncachedPrefixEnd > split.getStart()) {
    // TODO: can we merge neighboring splits? So we don't init so many readers.
    FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(),
        uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null)) return null;
  }
  while (!slices.isEmpty()) {
    StripeData slice = slices.remove(0);
    long start = slice.getKnownTornStart();
    // Will also read the last row.
    long len = slice.getLastStart() - start;
    FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice)) return null;
  }
  boolean isUnfortunate = false;
  if (uncachedSuffixStart == endOfSplit) {
    // This is rather obscure. The end of last row cached is precisely at the split end offset.
    // If the split is in the middle of the file, LRR would read one more row after that,
    // therefore as unfortunate as it is, we have to do a one-row read. However, for that to
    // have happened, someone should have supplied a split that ends inside the last row, i.e.
    // a few bytes earlier than the current split, which is pretty unlikely. What is more likely
    // is that the split, and the last row, both end at the end of file. Check for this.
    long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
    isUnfortunate = size > endOfSplit;
    if (isUnfortunate) {
      // Log at warn, given how unfortunate this is.
      LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath()
          + " at " + endOfSplit + "; file size is " + size);
    }
  }
  if (uncachedSuffixStart < endOfSplit || isUnfortunate) {
    // Note: we assume a 0-length split is correct, given how LRR interprets offsets (reading an
    // extra row). Should we instead assume 1+ chars and add 1 for isUnfortunate?
    // Do not read from uncachedSuffixStart, as LineRecordReader skips the first row.
    FileSplit splitPart = new FileSplit(split.getPath(), lastStripeLastStart,
        endOfSplit - lastStripeLastStart, hosts, inMemoryHosts);
    if (!processOneFileSplit(splitPart, startTime, stripeIx, null)) return null;
  }
  return true;
}
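In effect, readFileWithCache partitions the split into an uncached prefix (before the first cached slice's known torn start), the cached slices themselves, and an uncached suffix (starting at the last slice's last row start, since a LineRecordReader-style reader skips the first torn row of a mid-file split). The sketch below reproduces only that offset arithmetic with a hypothetical, simplified Slice type; it is an illustration, not the Hive implementation, and it omits the "unfortunate" edge case handled above.

import java.util.ArrayList;
import java.util.List;

// Hypothetical, simplified illustration of how readFileWithCache() carves a split
// into an uncached prefix, the cached slices, and an uncached suffix.
public class SplitPartitionSketch {
  // Stand-in for the SerDeLowLevelCacheImpl.StripeData coordinates used above.
  record Slice(long knownTornStart, long lastStart, long lastEnd) {}
  record Range(long offset, long length) {}

  static List<Range> uncachedRanges(long splitStart, long splitLength, List<Slice> slices) {
    List<Range> result = new ArrayList<>();
    long endOfSplit = splitStart + splitLength;
    if (slices.isEmpty()) {
      result.add(new Range(splitStart, splitLength)); // nothing cached: read the whole split
      return result;
    }
    long uncachedPrefixEnd = slices.get(0).knownTornStart();
    long uncachedSuffixStart = slices.get(slices.size() - 1).lastEnd();
    long lastStripeLastStart = slices.get(slices.size() - 1).lastStart();
    if (uncachedPrefixEnd > splitStart) {
      result.add(new Range(splitStart, uncachedPrefixEnd - splitStart));
    }
    if (uncachedSuffixStart < endOfSplit) {
      // Start at the last row's start, not at uncachedSuffixStart, because the reader
      // skips the first (torn) row of a split that begins mid-file.
      result.add(new Range(lastStripeLastStart, endOfSplit - lastStripeLastStart));
    }
    return result;
  }

  public static void main(String[] args) {
    List<Slice> slices = List.of(new Slice(100, 850, 900));
    // Prints the prefix [0, 100) and the suffix starting at 850.
    System.out.println(uncachedRanges(0, 1000, slices));
  }
}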
Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.
The class SerDeEncodedDataReader, method processAsyncCacheData.
private void processAsyncCacheData(CacheWriter.CacheStripeData diskData,
    boolean[] splitIncludes) throws IOException {
  StripeData sliceToCache = new StripeData(diskData.knownTornStart, diskData.firstRowStart,
      diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount,
      diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
  for (int colIx = 0; colIx < splitIncludes.length; ++colIx) {
    if (!splitIncludes[colIx]) continue;
    // The column has been read from disk.
    List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
    LlapSerDeDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
    // Struct column, such as root?
    if (streams == null) continue;
    Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
    while (iter.hasNext()) {
      CacheWriter.CacheStreamData stream = iter.next();
      if (stream.isSuppressed) {
        if (LlapIoImpl.LOG.isTraceEnabled()) {
          LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
        }
        iter.remove();
        discardUncachedBuffers(stream.data);
        continue;
      }
      setStreamDataToCache(newCacheDataForCol, stream);
    }
  }
  if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
    LlapIoImpl.CACHE_LOGGER.trace("Data to cache from async read " + sliceToCache);
  }
  try {
    cacheFileData(sliceToCache);
  } finally {
    unlockAllBuffers(sliceToCache);
  }
}
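Two patterns above are worth noting: suppressed streams are dropped with Iterator.remove() while iterating (a plain for-each loop would throw ConcurrentModificationException on removal), and the buffers are always unlocked in a finally block after the cache put. A minimal, self-contained sketch of the removal pattern with a hypothetical StreamData stand-in type follows; it illustrates the technique only, not the Hive classes.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Hypothetical illustration of the suppressed-stream filtering in processAsyncCacheData().
public class SuppressedStreamSketch {
  // Stand-in for CacheWriter.CacheStreamData.
  record StreamData(String name, boolean isSuppressed) {}

  static void dropSuppressed(List<StreamData> streams) {
    Iterator<StreamData> iter = streams.iterator();
    while (iter.hasNext()) {
      StreamData stream = iter.next();
      if (stream.isSuppressed()) {
        iter.remove(); // safe removal during iteration
        // The real code also discards the stream's uncached buffers at this point.
      }
    }
  }

  public static void main(String[] args) {
    List<StreamData> streams = new ArrayList<>(List.of(
        new StreamData("DATA", false), new StreamData("PRESENT", true)));
    dropSuppressed(streams);
    System.out.println(streams); // [StreamData[name=DATA, isSuppressed=false]]
  }
}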
Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.
The class SerDeEncodedDataReader, method processOneSlice.
private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes,
    int stripeIx, StripeData cacheData, long startTime) throws IOException, InterruptedException {
  logProcessOneSlice(stripeIx, diskData, cacheData);
  ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
  LlapSerDeDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
  long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
  SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
  StripeData sliceToCache = null;
  boolean hasAllData = diskData == null;
  if (!hasAllData) {
    sliceToCache = createSliceToCache(diskData, cacheData);
    metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
    metadata.setRowCount(diskData.rowCount);
  } else {
    metadata.setEncodings(Lists.newArrayList(cacheEncodings));
    metadata.setRowCount(cacheRowCount);
  }
  if (LlapIoImpl.LOG.isTraceEnabled()) {
    LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
  }
  consumer.setStripeMetadata(metadata);
  OrcEncodedColumnBatch ecb = useObjectPools ? ECB_POOL.take() : new OrcEncodedColumnBatch();
  ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
  // Skip the 0th column that is the root structure.
  for (int colIx = 1; colIx < writerIncludes.length; ++colIx) {
    if (!writerIncludes[colIx]) continue;
    ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
    if (!hasAllData && splitIncludes[colIx]) {
      // The column has been read from disk.
      List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
      LlapSerDeDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
      // Struct column, such as root?
      if (streams == null) continue;
      Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
      while (iter.hasNext()) {
        CacheWriter.CacheStreamData stream = iter.next();
        if (stream.isSuppressed) {
          if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
          }
          iter.remove();
          discardUncachedBuffers(stream.data);
          continue;
        }
        int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
        ColumnStreamData cb = useObjectPools ? CSD_POOL.take() : new ColumnStreamData();
        cb.incRef();
        cb.setCacheBuffers(stream.data);
        ecb.setStreamData(colIx, streamIx, cb);
      }
    } else {
      processColumnCacheData(cacheBuffers, ecb, colIx);
    }
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return false;
  }
  // For now, rely on the cache put to lock the buffers before we send them over to the consumer.
  if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
    LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
  }
  cacheFileData(sliceToCache);
  return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
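For each included column, processOneSlice either packages the freshly read disk streams or reuses the buffers already in the cache, depending on whether the slice was fully cached (hasAllData) and whether the column was part of the disk read (splitIncludes). A simplified sketch of that per-column dispatch follows; Batch, addFromDisk and addFromCache are hypothetical stand-ins, not the OrcEncodedColumnBatch API.

// Hypothetical, simplified illustration of the per-column dispatch in processOneSlice().
public class ColumnDispatchSketch {
  interface Batch {
    void addFromDisk(int colIx);  // stand-in for wrapping CacheStreamData into ColumnStreamData
    void addFromCache(int colIx); // stand-in for processColumnCacheData()
  }

  static void fillBatch(Batch ecb, boolean[] writerIncludes, boolean[] splitIncludes,
      boolean hasAllData) {
    // Column 0 is the root struct; start from 1, as in the real method.
    for (int colIx = 1; colIx < writerIncludes.length; ++colIx) {
      if (!writerIncludes[colIx]) continue;
      if (!hasAllData && splitIncludes[colIx]) {
        ecb.addFromDisk(colIx);  // column was just read and encoded from disk
      } else {
        ecb.addFromCache(colIx); // column data already lives in the cache
      }
    }
  }

  public static void main(String[] args) {
    Batch printer = new Batch() {
      public void addFromDisk(int colIx) { System.out.println("disk  col " + colIx); }
      public void addFromCache(int colIx) { System.out.println("cache col " + colIx); }
    };
    fillBatch(printer, new boolean[] {true, true, true}, new boolean[] {false, true, false}, false);
    // prints: disk  col 1 / cache col 2
  }
}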
Use of org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData in project hive by apache.
The class SerDeEncodedDataReader, method createSliceToCache.
private StripeData createSliceToCache(
    CacheWriter.CacheStripeData diskData, StripeData cacheData) throws IOException {
  assert diskData != null;
  if (cacheData == null) {
    return new StripeData(diskData.knownTornStart, diskData.firstRowStart,
        diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount,
        diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
  } else {
    long rowCount = diskData.rowCount, encodingCount = diskData.encodings.size();
    validateCacheAndDisk(cacheData, rowCount, encodingCount, diskData);
    if (LlapIoImpl.LOG.isDebugEnabled()) {
      LlapIoImpl.LOG.debug("Creating slice to cache in addition to an existing slice "
          + cacheData.toCoordinateString() + "; disk offsets were " + diskData.toCoordinateString());
    }
    // Note: we could just do what we already do above from disk data, except for the validation
    // that is not strictly necessary, and knownTornStart which is an optimization.
    StripeData sliceToCache = StripeData.duplicateStructure(cacheData);
    for (int i = 0; i < diskData.encodings.size(); ++i) {
      sliceToCache.getEncodings()[i] = diskData.encodings.get(i);
    }
    sliceToCache.setKnownTornStart(Math.min(diskData.knownTornStart, sliceToCache.getKnownTornStart()));
    return sliceToCache;
  }
}
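When a cached slice already exists, createSliceToCache duplicates its structure, overlays the encodings that were just read from disk, and keeps the earlier of the two knownTornStart offsets. The sketch below shows that merge with hypothetical stand-in types (Slice and plain String encodings instead of StripeData and ColumnEncoding); it is an illustration of the merge logic only, not the Hive implementation.

import java.util.Arrays;

// Hypothetical illustration of the merge performed in createSliceToCache():
// duplicate the cached slice's structure, overlay disk encodings, keep the earlier torn start.
public class SliceMergeSketch {
  // Stand-in for StripeData; encodings stand in for ColumnEncoding entries.
  static class Slice {
    long knownTornStart;
    String[] encodings;
    Slice(long knownTornStart, String[] encodings) {
      this.knownTornStart = knownTornStart;
      this.encodings = encodings;
    }
  }

  static Slice merge(Slice cached, long diskKnownTornStart, String[] diskEncodings) {
    // Duplicate the cached structure (stand-in for StripeData.duplicateStructure()).
    Slice merged = new Slice(cached.knownTornStart, cached.encodings.clone());
    for (int i = 0; i < diskEncodings.length; ++i) {
      merged.encodings[i] = diskEncodings[i]; // overlay what was just read from disk
    }
    merged.knownTornStart = Math.min(diskKnownTornStart, merged.knownTornStart);
    return merged;
  }

  public static void main(String[] args) {
    Slice cached = new Slice(200L, new String[] {"DICT", "DICT", "DIRECT"});
    Slice merged = merge(cached, 120L, new String[] {"DIRECT"}); // only column 0 read from disk
    System.out.println(merged.knownTornStart + " " + Arrays.toString(merged.encodings));
    // prints: 120 [DIRECT, DICT, DIRECT]
  }
}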