Usage of org.apache.orc.StripeInformation in project hive (Apache):
class OrcEncodedDataReader, method performDataRead.
/**
 * Reads and decodes the ORC data for this split, feeding encoded column data to the consumer.
 * The method is structured as a sequence of numbered steps; between steps it repeatedly checks
 * processStop() so a cancelled fragment can bail out early. Every failure path goes through
 * handleReaderError(startTime, t) and returns null; success paths record the read time and
 * notify the consumer via setDone().
 *
 * @return always null (Void); results are delivered through the consumer callback
 * @throws IOException on unrecoverable I/O failure not handled by handleReaderError
 * @throws InterruptedException if the fragment is interrupted
 */
protected Void performDataRead() throws IOException, InterruptedException {
long startTime = counters.startTimeCounter();
LlapIoImpl.LOG.info("Processing data for file {}: {}", fileKey, split.getPath());
if (processStop()) {
recordReaderTime(startTime);
return null;
}
counters.setDesc(QueryFragmentCounters.Desc.TABLE, cacheTag.getTableName());
counters.setDesc(QueryFragmentCounters.Desc.FILE, split.getPath() + (fileKey == null ? "" : " (" + fileKey + ")"));
try {
// 1. Validate the file metadata before deciding what to read.
validateFileMetadata();
// 2. Determine which stripes to read based on the split.
determineStripesToRead();
} catch (Throwable t) {
handleReaderError(startTime, t);
return null;
}
if (stripeRgs.length == 0) {
consumer.setDone();
recordReaderTime(startTime);
tracePool.offer(trace);
// No data to read.
return null;
}
counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + stripeRgs.length);
// 3. Apply SARG if needed, and otherwise determine what RGs to read.
// A row index stride of 0 means no row indexes, so SARG-based RG elimination is impossible.
int stride = fileMetadata.getRowIndexStride();
ArrayList<OrcStripeMetadata> stripeMetadatas = null;
try {
if (sarg != null && stride != 0) {
// TODO: move this to a common method
// Note: this gets IDs by name, so we assume indices don't need to be adjusted for ACID.
int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sarg.getLeaves(), evolution);
// included will not be null, row options will fill the array with trues if null
sargColumns = new boolean[evolution.getFileSchema().getMaximumId() + 1];
for (int i : filterColumns) {
// TODO: should this then be >=?
if (i > 0) {
sargColumns[i] = true;
}
}
// If SARG is present, get relevant stripe metadata from cache or readers.
stripeMetadatas = readStripesMetadata(fileIncludes, sargColumns);
}
// Now, apply SARG if any; w/o sarg, this will just initialize stripeRgs.
boolean hasData = determineRgsToRead(stride, stripeMetadatas);
if (!hasData) {
consumer.setDone();
recordReaderTime(startTime);
tracePool.offer(trace);
// No data to read.
return null;
}
} catch (Throwable t) {
handleReaderError(startTime, t);
return null;
}
if (processStop()) {
recordReaderTime(startTime);
return null;
}
// 4. Create encoded data reader.
try {
ensureDataReader();
} catch (Throwable t) {
handleReaderError(startTime, t);
return null;
}
// 6. Read data.
// TODO: I/O threadpool could be here - one thread per stripe; for now, linear.
boolean hasFileId = this.fileKey != null;
// NOTE(review): stripeKey is null when fileKey is null, yet it is dereferenced
// unconditionally below (stripeKey.stripeIx = ...) when stripeMetadatas == null.
// Presumably fileKey is never null on that path - TODO confirm.
OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, -1, 0) : null;
pathCache.touch(fileKey, split.getPath().toUri().toString());
for (int stripeIxMod = 0; stripeIxMod < stripeRgs.length; ++stripeIxMod) {
if (processStop()) {
recordReaderTime(startTime);
return null;
}
// stripeIxMod is relative to the split; stripeIx is the absolute stripe index in the file.
int stripeIx = stripeIxFrom + stripeIxMod;
boolean[] rgs = null;
OrcStripeMetadata stripeMetadata = null;
StripeInformation si;
try {
si = fileMetadata.getStripes().get(stripeIx);
LlapIoImpl.ORC_LOGGER.trace("Reading stripe {}: {}, {}", stripeIx, si.getOffset(), si.getLength());
trace.logReadingStripe(stripeIx, si.getOffset(), si.getLength());
rgs = stripeRgs[stripeIxMod];
if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
LlapIoImpl.ORC_LOGGER.trace("stripeRgs[{}]: {}", stripeIxMod, Arrays.toString(rgs));
}
// in EncodedReaderImpl, but for now it's not that important.
// Sentinel comparison by reference: SARG eliminated every RG in this stripe, skip it.
if (rgs == RecordReaderImpl.SargApplier.READ_NO_RGS)
continue;
// 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering.
if (stripeMetadatas != null) {
stripeMetadata = stripeMetadatas.get(stripeIxMod);
} else {
stripeKey.stripeIx = stripeIx;
OrcProto.StripeFooter footer = getStripeFooterFromCacheOrDisk(si, stripeKey);
stripeMetadata = createOrcStripeMetadataObject(stripeIx, si, footer, fileIncludes, sargColumns);
ensureDataReader();
stripeReader.readIndexStreams(stripeMetadata.getIndex(), si, footer.getStreamsList(), fileIncludes, sargColumns);
consumer.setStripeMetadata(stripeMetadata);
}
} catch (Throwable t) {
handleReaderError(startTime, t);
return null;
}
if (processStop()) {
recordReaderTime(startTime);
return null;
}
// This is a sync call that will feed data to the consumer.
try {
// TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru
// consumer. It is potentially holding locked buffers, and must perform its own cleanup.
// Also, currently readEncodedColumns is not stoppable. The consumer will discard the
// data it receives for one stripe. We could probably interrupt it, if it checked that.
stripeReader.readEncodedColumns(stripeIx, si, stripeMetadata.getRowIndexes(), stripeMetadata.getEncodings(), stripeMetadata.getStreams(), fileIncludes, rgs, consumer);
} catch (Throwable t) {
handleReaderError(startTime, t);
return null;
}
}
// Done with all the things.
recordReaderTime(startTime);
consumer.setDone();
LlapIoImpl.LOG.trace("done processing {}", split);
tracePool.offer(trace);
// Close the stripe reader, we are done reading.
// NOTE(review): cleanupReaders() is only called on the success path; error paths
// presumably clean up inside handleReaderError - TODO confirm.
cleanupReaders();
return null;
}
Usage of org.apache.orc.StripeInformation in project hive (Apache):
class OrcEncodedDataReader, method determineRgsToRead.
/**
 * Determines which RGs need to be read, after stripes have been determined.
 * SARG is applied, and readState is populated for each stripe accordingly.
 *
 * For each stripe in [stripeIxFrom, stripeIxFrom + stripeRgs.length), stripeRgs[i] is set to
 * either one of the SargApplier sentinel arrays (READ_ALL_RGS / READ_NO_RGS, compared by
 * reference downstream) or a per-RG boolean mask.
 *
 * @param rowIndexStride the file's row index stride; 0 disables SARG-based elimination
 * @param metadata per-stripe metadata aligned with stripeRgs; only read when a SARG applies
 * @return true if at least one stripe still has row groups to read
 */
private boolean determineRgsToRead(int rowIndexStride, ArrayList<OrcStripeMetadata> metadata) throws IOException {
RecordReaderImpl.SargApplier sargApp = null;
if (sarg != null && rowIndexStride != 0) {
sargApp = new RecordReaderImpl.SargApplier(sarg, rowIndexStride, evolution, OrcFile.WriterVersion.from(OrcFile.WriterImplementation.ORC_JAVA, fileMetadata.getWriterVersionNum()), true, fileMetadata.getCalendar() == CalendarKind.PROLEPTIC_GREGORIAN, true);
}
boolean hasAnyData = false;
// stripeRgs should have been initialized by this time with an empty array.
for (int stripeIxMod = 0; stripeIxMod < stripeRgs.length; ++stripeIxMod) {
int stripeIx = stripeIxMod + stripeIxFrom;
StripeInformation stripe = fileMetadata.getStripes().get(stripeIx);
int rgCount = getRgCount(stripe, rowIndexStride);
boolean[] rgsToRead = null;
if (sargApp != null) {
OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod);
rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getBloomFilterKinds(), stripeMetadata.getEncodings(), stripeMetadata.getBloomFilterIndexes(), true);
}
// Sentinels are compared by reference, not content; null (no SARG) means "read all".
boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS, isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS;
hasAnyData = hasAnyData || !isNone;
if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
if (isNone) {
LlapIoImpl.ORC_LOGGER.trace("SARG eliminated all RGs for stripe {}", stripeIx);
trace.logSargResult(stripeIx, 0);
} else if (!isAll) {
LlapIoImpl.ORC_LOGGER.trace("SARG picked RGs for stripe {}: {}", stripeIx, DebugUtils.toString(rgsToRead));
trace.logSargResult(stripeIx, rgsToRead);
} else {
LlapIoImpl.ORC_LOGGER.trace("Will read all {} RGs for stripe {}", rgCount, stripeIx);
trace.logSargResult(stripeIx, rgCount);
}
}
assert isAll || isNone || rgsToRead.length == rgCount;
// Store sentinels as-is; copy real masks, presumably because SargApplier may
// reuse its result array across stripes - TODO confirm.
stripeRgs[stripeIxMod] = (isAll || isNone) ? rgsToRead : Arrays.copyOf(rgsToRead, rgsToRead.length);
adjustRgMetric(rgCount, rgsToRead, isNone, isAll);
}
return hasAnyData;
}
Usage of org.apache.orc.StripeInformation in project hive (Apache):
class OrcEncodedDataReader, method readStripesMetadata.
/**
 * Loads stripe metadata (footer and index streams) for every stripe covered by the split.
 * Each OrcStripeMetadata is also pushed to the consumer as it is produced, so downstream
 * processing can begin before all stripes are read.
 *
 * @param includes    which columns are included in the read
 * @param sargColumns columns referenced by the SARG, for index stream selection
 * @return the per-stripe metadata, in split order (aligned with stripeRgs)
 */
private ArrayList<OrcStripeMetadata> readStripesMetadata(boolean[] includes, boolean[] sargColumns) throws IOException {
  ArrayList<OrcStripeMetadata> metadatas = new ArrayList<>(stripeRgs.length);
  boolean hasFileId = this.fileKey != null;
  // Reusable cache key; the stripe index is updated per iteration.
  OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, 0, 0) : null;
  for (int relIx = 0; relIx < stripeRgs.length; ++relIx) {
    int absIx = relIx + stripeIxFrom;
    stripeKey.stripeIx = absIx;
    StripeInformation stripeInfo = fileMetadata.getStripes().get(absIx);
    OrcProto.StripeFooter stripeFooter = getStripeFooterFromCacheOrDisk(stripeInfo, stripeKey);
    OrcStripeMetadata stripeMeta = createOrcStripeMetadataObject(absIx, stripeInfo, stripeFooter, includes, sargColumns);
    ensureDataReader();
    OrcIndex rowIndex = stripeMeta.getIndex();
    stripeReader.readIndexStreams(rowIndex, stripeInfo, stripeFooter.getStreamsList(), includes, sargColumns);
    metadatas.add(stripeMeta);
    consumer.setStripeMetadata(stripeMeta);
  }
  return metadatas;
}
Usage of org.apache.orc.StripeInformation in project hive (Apache):
class OrcEncodedDataReader, method determineStripesToRead.
/**
 * Determine which stripes to read for a split. Populates stripeIxFrom and stripeRgs.
 * Assumes splits never start in the middle of a stripe: a stripe is included iff its
 * start offset falls within [split.getStart(), split.getStart() + split.getLength()).
 * If the split covers no stripe, stripeIxFrom stays -1 and stripeRgs becomes empty.
 */
public void determineStripesToRead() {
  // The unit of caching for ORC is (rg x column) (see OrcBatchKey).
  List<StripeInformation> stripes = fileMetadata.getStripes();
  long offset = split.getStart(), maxOffset = offset + split.getLength();
  stripeIxFrom = -1;
  int stripeIxTo = -1;
  if (LlapIoImpl.ORC_LOGGER.isDebugEnabled()) {
    // Use StringBuilder: the original repeated String '+' in a loop, which is O(n^2)
    // for files with many stripes. Output text is unchanged.
    StringBuilder tmp = new StringBuilder("FileSplit {")
        .append(split.getStart()).append(", ").append(split.getLength()).append("}; stripes ");
    for (StripeInformation stripe : stripes) {
      tmp.append('{').append(stripe.getOffset()).append(", ").append(stripe.getLength()).append("}, ");
    }
    LlapIoImpl.ORC_LOGGER.debug(tmp.toString());
  }
  int stripeIx = 0;
  for (StripeInformation stripe : stripes) {
    long stripeStart = stripe.getOffset();
    if (offset > stripeStart) {
      // We assume splits will never start in the middle of the stripe.
      ++stripeIx;
      continue;
    }
    if (stripeIxFrom == -1) {
      LlapIoImpl.ORC_LOGGER.trace("Including stripes from {} ({} >= {})", stripeIx, stripeStart, offset);
      stripeIxFrom = stripeIx;
    }
    if (stripeStart >= maxOffset) {
      // First stripe past the split's end; it is excluded (stripeIxTo is exclusive).
      stripeIxTo = stripeIx;
      LlapIoImpl.ORC_LOGGER.trace("Including stripes until {} ({} >= {}); {} stripes", stripeIxTo, stripeStart, maxOffset, (stripeIxTo - stripeIxFrom));
      break;
    }
    ++stripeIx;
  }
  if (stripeIxFrom == -1) {
    LlapIoImpl.LOG.info("Not including any stripes - empty split");
  }
  if (stripeIxTo == -1 && stripeIxFrom != -1) {
    // The split runs to the end of the file.
    stripeIxTo = stripeIx;
    LlapIoImpl.ORC_LOGGER.trace("Including stripes until {} (end of file); {} stripes", stripeIx, (stripeIxTo - stripeIxFrom));
  }
  stripeRgs = new boolean[stripeIxTo - stripeIxFrom][];
}
Usage of org.apache.orc.StripeInformation in project hive (Apache):
class TestOrcFile, method testMemoryManagementV12.
@Test
public void testMemoryManagementV12() throws Exception {
  // Check memory frequently so allocation scaling takes effect right away.
  OrcConf.ROWS_BETWEEN_CHECKS.setLong(conf, 100);
  final long poolSize = 50_000;
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  MemoryManager memoryManager = new MemoryManagerImpl(poolSize);
  // Register nine phantom writers that each request the full pool, so the real
  // writer below is allotted a 1/10 share of memory.
  MemoryManager.Callback ignore = newScale -> false;
  for (int writerIx = 0; writerIx < 9; ++writerIx) {
    memoryManager.addWriter(new Path("file-" + writerIx), poolSize, ignore);
  }
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .compress(CompressionKind.NONE)
          .stripeSize(50000)
          .bufferSize(100)
          .rowIndexStride(0)
          .memory(memoryManager)
          .batchSize(100)
          .version(OrcFile.Version.V_0_12));
  // Ten writers sharing the pool: the scale must have dropped to 1/10.
  assertEquals(0.1, ((MemoryManagerImpl) memoryManager).getAllocationScale());
  for (int row = 0; row < 2500; ++row) {
    writer.addRow(new InnerStruct(row * 300, Integer.toHexString(10 * row)));
  }
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  // Every stripe must have been flushed while still small, given the reduced budget.
  int stripeCount = 0;
  for (StripeInformation stripe : reader.getStripes()) {
    stripeCount += 1;
    assertTrue("stripe " + stripeCount + " is too long at " + stripe.getDataLength(), stripe.getDataLength() < 5000);
  }
  // with HIVE-7832, the dictionaries will be disabled after writing the first
  // stripe as there are too many distinct values. Hence only 3 stripes as
  // compared to 25 stripes in version 0.11 (above test case)
  assertEquals(3, stripeCount);
  assertEquals(2500, reader.getNumberOfRows());
  reader.close();
}
Aggregations