Use of org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey in project hive by apache.
In class TestIncrementalObjectSizeEstimator, method testMetadata:
@Test
public void testMetadata() throws IOException {
  // Mostly tests that it doesn't crash.
  OrcStripeMetadata osm = OrcStripeMetadata.createDummy(0);
  HashMap<Class<?>, ObjectEstimator> map = IncrementalObjectSizeEstimator.createEstimators(osm);
  IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", map);
  ObjectEstimator root = map.get(OrcStripeMetadata.class);
  LOG.info("Estimated " + root.estimate(osm, map) + " for a dummy OSM");
  OrcBatchKey stripeKey = null;
  DummyMetadataReader mr = new DummyMetadataReader();
  mr.doStreamStep = false;
  mr.isEmpty = true;
  StripeInformation si = Mockito.mock(StripeInformation.class);
  Mockito.when(si.getNumberOfRows()).thenReturn(0L);
  osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
  LOG.info("Estimated " + root.estimate(osm, map) + " for an empty OSM");
  mr.doStreamStep = true;
  osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
  LOG.info("Estimated " + root.estimate(osm, map) + " for an empty OSM after serde");
  mr.isEmpty = false;
  stripeKey = new OrcBatchKey(0, 0, 0);
  osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
  LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM");
  osm.resetRowIndex();
  LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM w/o row index");
  mr.doStreamStep = true;
  osm = new OrcStripeMetadata(stripeKey, mr, si, null, null, null, null);
  LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM after serde");
  osm.resetRowIndex();
  LOG.info("Estimated " + root.estimate(osm, map) + " for a test OSM w/o row index after serde");
  OrcFileMetadata ofm = OrcFileMetadata.createDummy(0);
  map = IncrementalObjectSizeEstimator.createEstimators(ofm);
  IncrementalObjectSizeEstimator.addEstimator("com.google.protobuf.LiteralByteString", map);
  root = map.get(OrcFileMetadata.class);
  LOG.info("Estimated " + root.estimate(ofm, map) + " for a dummy OFM");
}
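For orientation, here is a minimal sketch (not part of the test above) of the same sizing pattern applied directly to an OrcBatchKey. It assumes IncrementalObjectSizeEstimator.createEstimators accepts an arbitrary root object, as the test's use with both OrcStripeMetadata and OrcFileMetadata suggests; the 0/0/0 key mirrors the dummy key constructed in the test.
OrcBatchKey key = new OrcBatchKey(0, 0, 0); // same dummy key values as in the test above
// Assumption: estimators can be rooted at any object, so the map should contain OrcBatchKey.class.
HashMap<Class<?>, ObjectEstimator> keyMap = IncrementalObjectSizeEstimator.createEstimators(key);
ObjectEstimator keyRoot = keyMap.get(OrcBatchKey.class);
LOG.info("Estimated " + keyRoot.estimate(key, keyMap) + " for a dummy OrcBatchKey");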
Use of org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey in project hive by apache.
In class OrcEncodedDataReader, method performDataRead:
protected Void performDataRead() throws IOException, InterruptedException {
  long startTime = counters.startTimeCounter();
  LlapIoImpl.LOG.info("Processing data for file {}: {}", fileKey, split.getPath());
  if (processStop()) {
    recordReaderTime(startTime);
    return null;
  }
  counters.setDesc(QueryFragmentCounters.Desc.TABLE, cacheTag.getTableName());
  counters.setDesc(QueryFragmentCounters.Desc.FILE,
      split.getPath() + (fileKey == null ? "" : " (" + fileKey + ")"));
  try {
    validateFileMetadata();
    // 2. Determine which stripes to read based on the split.
    determineStripesToRead();
  } catch (Throwable t) {
    handleReaderError(startTime, t);
    return null;
  }
  if (stripeRgs.length == 0) {
    consumer.setDone();
    recordReaderTime(startTime);
    tracePool.offer(trace);
    // No data to read.
    return null;
  }
  counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + stripeRgs.length);
  // 3. Apply SARG if needed, and otherwise determine what RGs to read.
  int stride = fileMetadata.getRowIndexStride();
  ArrayList<OrcStripeMetadata> stripeMetadatas = null;
  try {
    if (sarg != null && stride != 0) {
      // TODO: move this to a common method
      // Note: this gets IDs by name, so we assume indices don't need to be adjusted for ACID.
      int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(
          sarg.getLeaves(), evolution);
      // included will not be null, row options will fill the array with trues if null
      sargColumns = new boolean[evolution.getFileSchema().getMaximumId() + 1];
      for (int i : filterColumns) {
        // TODO: should this then be >=?
        if (i > 0) {
          sargColumns[i] = true;
        }
      }
      // If SARG is present, get relevant stripe metadata from cache or readers.
      stripeMetadatas = readStripesMetadata(fileIncludes, sargColumns);
    }
    // Now, apply SARG if any; w/o sarg, this will just initialize stripeRgs.
    boolean hasData = determineRgsToRead(stride, stripeMetadatas);
    if (!hasData) {
      consumer.setDone();
      recordReaderTime(startTime);
      tracePool.offer(trace);
      // No data to read.
      return null;
    }
  } catch (Throwable t) {
    handleReaderError(startTime, t);
    return null;
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return null;
  }
  // 4. Create encoded data reader.
  try {
    ensureDataReader();
  } catch (Throwable t) {
    handleReaderError(startTime, t);
    return null;
  }
  // 6. Read data.
  // TODO: I/O threadpool could be here - one thread per stripe; for now, linear.
  boolean hasFileId = this.fileKey != null;
  OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, -1, 0) : null;
  pathCache.touch(fileKey, split.getPath().toUri().toString());
  for (int stripeIxMod = 0; stripeIxMod < stripeRgs.length; ++stripeIxMod) {
    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }
    int stripeIx = stripeIxFrom + stripeIxMod;
    boolean[] rgs = null;
    OrcStripeMetadata stripeMetadata = null;
    StripeInformation si;
    try {
      si = fileMetadata.getStripes().get(stripeIx);
      LlapIoImpl.ORC_LOGGER.trace("Reading stripe {}: {}, {}",
          stripeIx, si.getOffset(), si.getLength());
      trace.logReadingStripe(stripeIx, si.getOffset(), si.getLength());
      rgs = stripeRgs[stripeIxMod];
      if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
        LlapIoImpl.ORC_LOGGER.trace("stripeRgs[{}]: {}", stripeIxMod, Arrays.toString(rgs));
      }
      // in EncodedReaderImpl, but for now it's not that important.
      if (rgs == RecordReaderImpl.SargApplier.READ_NO_RGS) continue;
      // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering.
      if (stripeMetadatas != null) {
        stripeMetadata = stripeMetadatas.get(stripeIxMod);
      } else {
        stripeKey.stripeIx = stripeIx;
        OrcProto.StripeFooter footer = getStripeFooterFromCacheOrDisk(si, stripeKey);
        stripeMetadata = createOrcStripeMetadataObject(stripeIx, si, footer, fileIncludes, sargColumns);
        ensureDataReader();
        stripeReader.readIndexStreams(stripeMetadata.getIndex(), si,
            footer.getStreamsList(), fileIncludes, sargColumns);
        consumer.setStripeMetadata(stripeMetadata);
      }
    } catch (Throwable t) {
      handleReaderError(startTime, t);
      return null;
    }
    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }
    // This is a sync call that will feed data to the consumer.
    try {
      // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru
      // consumer. It is potentially holding locked buffers, and must perform its own cleanup.
      // Also, currently readEncodedColumns is not stoppable. The consumer will discard the
      // data it receives for one stripe. We could probably interrupt it, if it checked that.
      stripeReader.readEncodedColumns(stripeIx, si, stripeMetadata.getRowIndexes(),
          stripeMetadata.getEncodings(), stripeMetadata.getStreams(), fileIncludes, rgs,
          consumer);
    } catch (Throwable t) {
      handleReaderError(startTime, t);
      return null;
    }
  }
  // Done with all the things.
  recordReaderTime(startTime);
  consumer.setDone();
  LlapIoImpl.LOG.trace("done processing {}", split);
  tracePool.offer(trace);
  // Close the stripe reader, we are done reading.
  cleanupReaders();
  return null;
}
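The loop above allocates a single OrcBatchKey per file (with -1 as a placeholder stripe index) and mutates its public stripeIx field before each stripe-footer cache lookup, rather than creating a new key per stripe. The condensed sketch below illustrates that pattern in isolation; the footerCache map, the byte[] value type, and the lookupFooters helper are hypothetical stand-ins rather than Hive's API, and it assumes OrcBatchKey defines equals/hashCode (which its use as a cache key implies) and a non-null file key.
import java.util.Map;
import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey;

public class StripeKeyReuseSketch {
  // Hypothetical helper: looks up per-stripe values keyed by (fileKey, stripeIx, 0),
  // reusing one mutable OrcBatchKey instead of allocating a key for every stripe.
  static void lookupFooters(Object fileKey, int stripeIxFrom, int stripeCount,
      Map<OrcBatchKey, byte[]> footerCache) {
    OrcBatchKey stripeKey = new OrcBatchKey(fileKey, -1, 0); // -1 is a placeholder, as above
    for (int stripeIxMod = 0; stripeIxMod < stripeCount; ++stripeIxMod) {
      stripeKey.stripeIx = stripeIxFrom + stripeIxMod; // mutate the shared key per stripe
      byte[] cachedFooter = footerCache.get(stripeKey);
      // The real reader falls back to reading the footer from disk on a cache miss.
    }
  }
}
Because the shared key is only used for lookups and is never stored in the map, mutating it between lookups is safe.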
Use of org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey in project hive by apache.
In class OrcEncodedDataReader, method readStripesMetadata:
/**
 * Reads the metadata for the stripes covered by this split.
 */
private ArrayList<OrcStripeMetadata> readStripesMetadata(boolean[] includes,
    boolean[] sargColumns) throws IOException {
  ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(stripeRgs.length);
  boolean hasFileId = this.fileKey != null;
  OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, 0, 0) : null;
  for (int stripeIxMod = 0; stripeIxMod < stripeRgs.length; ++stripeIxMod) {
    int stripeIx = stripeIxMod + stripeIxFrom;
    stripeKey.stripeIx = stripeIx;
    StripeInformation si = fileMetadata.getStripes().get(stripeIx);
    OrcProto.StripeFooter footer = getStripeFooterFromCacheOrDisk(si, stripeKey);
    OrcStripeMetadata osm = createOrcStripeMetadataObject(stripeIx, si, footer, includes, sargColumns);
    ensureDataReader();
    OrcIndex index = osm.getIndex();
    stripeReader.readIndexStreams(index, si, footer.getStreamsList(), includes, sargColumns);
    result.add(osm);
    consumer.setStripeMetadata(osm);
  }
  return result;
}
Use of org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey in project hive by apache.
In class OrcEncodedDataReader, method createOrcStripeMetadataObject:
private OrcStripeMetadata createOrcStripeMetadataObject(int stripeIx, StripeInformation si,
    OrcProto.StripeFooter footer, boolean[] includes, boolean[] sargColumns) throws IOException {
  Stream.Kind[] bks = sargColumns == null ? null : new Stream.Kind[includes.length];
  BloomFilterIndex[] bis = sargColumns == null ? null : new BloomFilterIndex[includes.length];
  return new OrcStripeMetadata(new OrcBatchKey(fileKey, stripeIx, 0), footer,
      new OrcIndex(new RowIndex[includes.length], bks, bis), si);
}