Example use of org.apache.orc.StripeInformation in the Apache Hive project:
class OrcEncodedDataReader, method determineRgsToRead.
/**
* Determines which RGs need to be read, after stripes have been determined.
* SARG is applied, and readState is populated for each stripe accordingly.
*/
/**
 * Determines which RGs (row groups) need to be read, after stripes have been determined.
 * SARG is applied, and readState is populated for each stripe accordingly.
 *
 * @param globalIncludes file-level column include flags; index 0 is the ORC root struct
 *        column and is skipped when counting real included columns below.
 * @param rowIndexStride rows per row group; 0 means the file has no row indexes, so the
 *        SARG cannot be applied and all RGs are read.
 * @param metadata per-stripe metadata, parallel to readState (i.e. offset by stripeIxFrom).
 * @return true if at least one row group of at least one stripe still needs to be read.
 */
private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride,
    ArrayList<OrcStripeMetadata> metadata) throws IOException {
  // Only build a SARG applier when there is a SARG and the file has row indexes.
  RecordReaderImpl.SargApplier sargApp = null;
  if (sarg != null && rowIndexStride != 0) {
    List<OrcProto.Type> types = fileMetadata.getTypes();
    String[] colNamesForSarg = OrcInputFormat.getSargColumnNames(
        columnNames, types, globalIncludes, fileMetadata.isOriginalFormat());
    sargApp = new RecordReaderImpl.SargApplier(sarg, colNamesForSarg, rowIndexStride,
        evolution, OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum()));
  }
  boolean hasAnyData = false;
  // readState should have been initialized by this time with an empty array.
  for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
    // readState is indexed relative to the first stripe of this split.
    int stripeIx = stripeIxMod + stripeIxFrom;
    StripeInformation stripe = fileMetadata.getStripes().get(stripeIx);
    int rgCount = getRgCount(stripe, rowIndexStride);
    boolean[] rgsToRead = null;
    if (sargApp != null) {
      OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod);
      rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(),
          stripeMetadata.getBloomFilterKinds(), stripeMetadata.getBloomFilterIndexes(), true);
    }
    // Identity comparisons are intentional: READ_NO_RGS/READ_ALL_RGS are sentinel values
    // from SargApplier, not ordinary arrays. NOTE(review): a null rgsToRead (no SARG)
    // appears to match the READ_ALL_RGS sentinel here — confirm against SargApplier,
    // otherwise the copyOf below would NPE on the no-SARG path.
    boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS,
        isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS;
    hasAnyData = hasAnyData || !isNone;
    if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
      if (isNone) {
        LlapIoImpl.ORC_LOGGER.trace("SARG eliminated all RGs for stripe {}", stripeIx);
      } else if (!isAll) {
        LlapIoImpl.ORC_LOGGER.trace("SARG picked RGs for stripe {}: {}",
            stripeIx, DebugUtils.toString(rgsToRead));
      } else {
        LlapIoImpl.ORC_LOGGER.trace("Will read all {} RGs for stripe {}", rgCount, stripeIx);
      }
    }
    assert isAll || isNone || rgsToRead.length == rgCount;
    // Count actually-included columns, skipping the root struct at index 0.
    int fileIncludesCount = 0;
    // We don't need separate readState w/o HL cache, should get rid of that instead.
    for (int includeIx = 1; includeIx < globalIncludes.length; ++includeIx) {
      fileIncludesCount += (globalIncludes[includeIx] ? 1 : 0);
    }
    readState[stripeIxMod] = new boolean[fileIncludesCount][];
    for (int includeIx = 0; includeIx < fileIncludesCount; ++includeIx) {
      // Sentinels are shared by reference across columns; a real RG mask is copied per
      // column so each column's read progress can be tracked independently.
      readState[stripeIxMod][includeIx] = (isAll || isNone)
          ? rgsToRead : Arrays.copyOf(rgsToRead, rgsToRead.length);
    }
    adjustRgMetric(rgCount, rgsToRead, isNone, isAll);
  }
  return hasAnyData;
}
Example use of org.apache.orc.StripeInformation in the Apache Hive project:
class TestIncrementalObjectSizeEstimator, method testMetadata.
@Test
public void testMetadata() throws IOException {
  // Mostly a smoke test: verifies that size estimation doesn't crash on various
  // OrcStripeMetadata / OrcFileMetadata shapes; the estimates are only logged.
  OrcStripeMetadata stripeMeta = OrcStripeMetadata.createDummy(0);
  HashMap<Class<?>, ObjectEstimator> estimators =
      IncrementalObjectSizeEstimator.createEstimators(stripeMeta);
  IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", estimators);
  ObjectEstimator osmEstimator = estimators.get(OrcStripeMetadata.class);
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators) + " for a dummy OSM");

  // An empty stripe, before and after the stream step.
  DummyMetadataReader reader = new DummyMetadataReader();
  reader.doStreamStep = false;
  reader.isEmpty = true;
  StripeInformation stripeInfo = Mockito.mock(StripeInformation.class);
  Mockito.when(stripeInfo.getNumberOfRows()).thenReturn(0L);
  OrcBatchKey key = null;
  stripeMeta = new OrcStripeMetadata(key, reader, stripeInfo, null, null, null, null);
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators) + " for an empty OSM");
  reader.doStreamStep = true;
  stripeMeta = new OrcStripeMetadata(key, reader, stripeInfo, null, null, null, null);
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators)
      + " for an empty OSM after serde");

  // A non-empty stripe, with and without its row index, before and after serde.
  reader.isEmpty = false;
  key = new OrcBatchKey(0, 0, 0);
  stripeMeta = new OrcStripeMetadata(key, reader, stripeInfo, null, null, null, null);
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators) + " for a test OSM");
  stripeMeta.resetRowIndex();
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators)
      + " for a test OSM w/o row index");
  reader.doStreamStep = true;
  stripeMeta = new OrcStripeMetadata(key, reader, stripeInfo, null, null, null, null);
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators)
      + " for a test OSM after serde");
  stripeMeta.resetRowIndex();
  LOG.info("Estimated " + osmEstimator.estimate(stripeMeta, estimators)
      + " for a test OSM w/o row index after serde");

  // Same exercise for file-level metadata.
  OrcFileMetadata fileMeta = OrcFileMetadata.createDummy(0);
  estimators = IncrementalObjectSizeEstimator.createEstimators(fileMeta);
  IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", estimators);
  ObjectEstimator ofmEstimator = estimators.get(OrcFileMetadata.class);
  LOG.info("Estimated " + ofmEstimator.estimate(fileMeta, estimators) + " for a dummy OFM");
}
Example use of org.apache.orc.StripeInformation in the Apache Hive project:
class OrcFileFormatProxy, method applySargToMetadata.
/**
 * Applies the SARG to the serialized ORC file footer and returns the stripes to read.
 *
 * @param sarg the search argument used to eliminate stripes via stripe statistics.
 * @param fileMetadata serialized ORC file tail, as stored in the metastore cache.
 * @return the surviving stripes' offsets/lengths, or null if all stripes were eliminated.
 */
@Override
public SplitInfos applySargToMetadata(
    SearchArgument sarg, ByteBuffer fileMetadata) throws IOException {
  // TODO: ideally we should store shortened representation of only the necessary fields
  //       in HBase; it will probably require custom SARG application code.
  OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
  OrcProto.Footer footer = orcTail.getFooter();
  int stripeCount = footer.getStripesCount();
  // May be null when the SARG cannot be translated/applied; null means "keep all stripes".
  boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(sarg,
      orcTail.getWriterVersion(), footer.getTypesList(),
      orcTail.getStripeStatistics(), stripeCount);
  // For ORC case, send the boundaries of the stripes so we don't have to send the footer.
  SplitInfos.Builder sb = SplitInfos.newBuilder();
  List<StripeInformation> stripes = orcTail.getStripes();
  boolean isEliminated = true;
  // Iterate over stripeCount rather than result.length: the previous code dereferenced
  // result.length before its null check, throwing NPE whenever result was null.
  for (int i = 0; i < stripeCount; ++i) {
    if (result != null && !result[i]) continue; // Stripe eliminated by the SARG.
    isEliminated = false;
    StripeInformation si = stripes.get(i);
    if (LOG.isDebugEnabled()) {
      LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
    }
    sb.addInfos(SplitInfo.newBuilder().setIndex(i)
        .setOffset(si.getOffset()).setLength(si.getLength()));
  }
  return isEliminated ? null : sb.build();
}
Example use of org.apache.orc.StripeInformation in the Apache Flink project:
class OrcShimV200, method getOffsetAndLengthForSplit.
/**
 * Computes the byte range to read for a split: the union of all stripes whose start
 * offset falls inside [splitStart, splitStart + splitLength).
 *
 * @return (offset, length) of the range to read, or (0, 0) if no stripe starts in the split.
 */
@VisibleForTesting
public static Tuple2<Long, Long> getOffsetAndLengthForSplit(
    long splitStart, long splitLength, List<StripeInformation> stripes) {
  final long splitEnd = splitStart + splitLength;
  long rangeStart = Long.MAX_VALUE;
  long rangeEnd = Long.MIN_VALUE;
  for (StripeInformation stripe : stripes) {
    long stripeStart = stripe.getOffset();
    // A stripe belongs to the split iff it starts within the split's byte range.
    boolean startsInSplit = stripeStart >= splitStart && stripeStart < splitEnd;
    if (!startsInSplit) {
      continue;
    }
    if (stripeStart < rangeStart) {
      rangeStart = stripeStart;
    }
    long stripeEnd = stripeStart + stripe.getLength();
    if (stripeEnd > rangeEnd) {
      rangeEnd = stripeEnd;
    }
  }
  if (rangeStart == Long.MAX_VALUE) {
    // No stripe starts inside this split; nothing to read.
    return Tuple2.of(0L, 0L);
  }
  return Tuple2.of(rangeStart, rangeEnd - rangeStart);
}
Example use of org.apache.orc.StripeInformation in the Apache Hive project:
class OrcEncodedDataReader, method getFileFooterFromCacheOrDisk.
/**
* Gets file metadata for the split from cache, or reads it from the file.
*/
/**
 * Gets file metadata for the split from cache, or reads it from the file.
 *
 * On a cache hit, the serialized tail is parsed from the cached buffers; on a miss,
 * the footer is read from disk and (if caching is enabled) put into the cache.
 *
 * @return the file-level ORC metadata for this split's file.
 * @throws IOException on read failure, or if only cache reads are allowed and this is a miss.
 */
private OrcFileMetadata getFileFooterFromCacheOrDisk() throws IOException {
  LlapBufferOrBuffers tailBuffers = null;
  List<StripeStatistics> stats = null;
  List<StripeInformation> stripes = null;
  // Cache is only usable when the file has a stable key and a cache is configured.
  boolean hasCache = fileKey != null && metadataCache != null;
  if (hasCache) {
    tailBuffers = metadataCache.getFileMetadata(fileKey);
    if (tailBuffers != null) {
      try {
        OrcTail orcTail = getOrcTailFromLlapBuffers(tailBuffers);
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
        FileTail tail = orcTail.getFileTail();
        stats = getStripeStatsFromOrcTail(orcTail);
        stripes = new ArrayList<>(tail.getFooter().getStripesCount());
        int stripeIdx = 0;
        // Rebuild StripeInformation objects from the cached protobuf footer.
        // NOTE(review): the -1/null ctor args look like "no previous stripe /
        // no encryption context" — confirm against ReaderImpl.StripeInformationImpl.
        for (OrcProto.StripeInformation stripeProto : tail.getFooter().getStripesList()) {
          stripes.add(new ReaderImpl.StripeInformationImpl(stripeProto, stripeIdx++, -1, null));
        }
        return new OrcFileMetadata(fileKey, tail.getFooter(), tail.getPostscript(),
            stats, stripes, ReaderImpl.getFileVersion(tail.getPostscript().getVersionList()));
      } finally {
        // We don't need the buffer anymore; release our refcount even if parsing threw.
        metadataCache.decRefBuffer(tailBuffers);
      }
    } else {
      counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
      // In cache-only mode a miss is fatal; this throws instead of going to disk.
      throwIfCacheOnlyRead(isReadCacheOnly);
    }
  }
  // Cache miss (or no cache): read the footer from the file itself.
  ensureOrcReader();
  ByteBuffer tailBufferBb = orcReader.getSerializedFileFooter();
  if (hasCache) {
    tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, cacheTag, isStopped);
    // We don't use the cache's copy of the buffer; drop the ref putFileMetadata gave us.
    metadataCache.decRefBuffer(tailBuffers);
  }
  FileTail ft = orcReader.getFileTail();
  return new OrcFileMetadata(fileKey, ft.getFooter(), ft.getPostscript(),
      orcReader.getOrcProtoStripeStatistics(), orcReader.getStripes(),
      orcReader.getFileVersion());
}
Aggregations