use of org.apache.orc.TypeDescription in project hive by apache.
the class TestVectorizedOrcAcidRowBatchReader method testVectorizedOrcAcidRowBatchReader.
private void testVectorizedOrcAcidRowBatchReader(String deleteEventRegistry) throws Exception {
  List<OrcSplit> splits = getSplits();
  // Mark one of the transactions as an exception to test that invalid transactions
  // are being handled properly.
  // Exclude transaction 5
  conf.set(ValidTxnList.VALID_TXNS_KEY, "14:1:1:5");
  VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL);
  if (deleteEventRegistry.equals(ColumnizedDeleteEventRegistry.class.getName())) {
    assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof ColumnizedDeleteEventRegistry);
  }
  if (deleteEventRegistry.equals(SortMergedDeleteEventRegistry.class.getName())) {
    assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof SortMergedDeleteEventRegistry);
  }
  TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
  VectorizedRowBatch vectorizedRowBatch = schema.createRowBatch();
  // set data column count as 1.
  vectorizedRowBatch.setPartitionInfo(1, 0);
  long previousPayload = Long.MIN_VALUE;
  while (vectorizedReader.next(null, vectorizedRowBatch)) {
    assertTrue(vectorizedRowBatch.selectedInUse);
    LongColumnVector col = (LongColumnVector) vectorizedRowBatch.cols[0];
    for (int i = 0; i < vectorizedRowBatch.size; ++i) {
      int idx = vectorizedRowBatch.selected[i];
      long payload = col.vector[idx];
      long otid = (payload / NUM_ROWID_PER_OTID) + 1;
      long rowId = payload % NUM_ROWID_PER_OTID;
      assertFalse(rowId % 2 == 0 || rowId % 3 == 0);
      // Check that txn#5 has been excluded.
      assertTrue(otid != 5);
      // Check that the data is in sorted order.
      assertTrue(payload > previousPayload);
      previousPayload = payload;
    }
  }
}
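The test above obtains its TypeDescription from OrcInputFormat.getDesiredRowTypeDescr and lets the ACID reader populate the batch. As a standalone illustration of the TypeDescription-to-batch step, here is a minimal sketch that builds a schema by hand and fills one row; the struct<payload:bigint> schema is an assumption for illustration, not the test's actual ACID row schema.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

public class RowBatchSketch {
  public static void main(String[] args) {
    // Hypothetical single-column schema standing in for the ACID row schema.
    TypeDescription schema = TypeDescription.fromString("struct<payload:bigint>");
    // createRowBatch() allocates a batch sized VectorizedRowBatch.DEFAULT_SIZE (1024 rows).
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector payload = (LongColumnVector) batch.cols[0];
    payload.vector[0] = 42L;
    batch.size = 1;
    System.out.println("rows in batch: " + batch.size);
  }
}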
use of org.apache.orc.TypeDescription in project hive by apache.
the class TestOrcFile method testStripeLevelStats.
@Test
public void testStripeLevelStats() throws Exception {
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000).batchSize(1000));
  for (int i = 0; i < 11000; i++) {
    if (i >= 5000) {
      if (i >= 10000) {
        writer.addRow(new InnerStruct(3, "three"));
      } else {
        writer.addRow(new InnerStruct(2, "two"));
      }
    } else {
      writer.addRow(new InnerStruct(1, "one"));
    }
  }
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  TypeDescription schema = writer.getSchema();
  assertEquals(2, schema.getMaximumId());
  boolean[] expected = new boolean[] { false, true, false };
  boolean[] included = OrcUtils.includeColumns("int1", schema);
  assertEquals(true, Arrays.equals(expected, included));
  List<StripeStatistics> stats = reader.getStripeStatistics();
  int numStripes = stats.size();
  assertEquals(3, numStripes);
  StripeStatistics ss1 = stats.get(0);
  StripeStatistics ss2 = stats.get(1);
  StripeStatistics ss3 = stats.get(2);
  assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
  assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
  assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
  assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
  assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
  assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
  assertEquals(1, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getMinimum());
  assertEquals(2, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getMinimum());
  assertEquals(3, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getMinimum());
  assertEquals(1, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getMaximum());
  assertEquals(2, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getMaximum());
  assertEquals(3, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getMaximum());
  assertEquals(5000, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getSum());
  assertEquals(10000, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getSum());
  assertEquals(3000, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getSum());
  assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
  assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
  assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
  assertEquals("one", ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getMinimum());
  assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMinimum());
  assertEquals("three", ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getMinimum());
  assertEquals("one", ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getMaximum());
  assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
  assertEquals("three", ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getMaximum());
  assertEquals(15000, ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getSum());
  assertEquals(15000, ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getSum());
  assertEquals(5000, ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getSum());
  RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
  OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
  assertEquals(3, index.length);
  List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
  assertEquals(1, items.size());
  assertEquals(3, items.get(0).getPositionsCount());
  assertEquals(0, items.get(0).getPositions(0));
  assertEquals(0, items.get(0).getPositions(1));
  assertEquals(0, items.get(0).getPositions(2));
  assertEquals(1, items.get(0).getStatistics().getIntStatistics().getMinimum());
  index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
  assertEquals(3, index.length);
  items = index[1].getEntryList();
  assertEquals(2, items.get(0).getStatistics().getIntStatistics().getMaximum());
}
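The test writes rows through Hive's ObjectInspector-based Writer and then inspects per-stripe statistics. A rough equivalent using the core org.apache.orc writer API, where the schema is declared directly as a TypeDescription, might look like the sketch below; the file path and stripe size are assumptions, so the exact stripe boundaries and statistics will differ from the test.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeStatistics;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class StripeStatsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/stripe-stats-sketch.orc");  // assumed path
    TypeDescription schema = TypeDescription.fromString("struct<int1:int,string1:string>");
    Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector int1 = (LongColumnVector) batch.cols[0];
    BytesColumnVector string1 = (BytesColumnVector) batch.cols[1];
    for (int i = 0; i < 11000; i++) {
      int row = batch.size++;
      int1.vector[row] = (i < 5000) ? 1 : (i < 10000) ? 2 : 3;
      String text = (i < 5000) ? "one" : (i < 10000) ? "two" : "three";
      string1.setVal(row, text.getBytes(StandardCharsets.UTF_8));
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size != 0) {
      writer.addRowBatch(batch);
    }
    writer.close();
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    // Column 0 is the root struct; columns 1 and 2 are int1 and string1.
    for (StripeStatistics ss : reader.getStripeStatistics()) {
      System.out.println("values in int1: " + ss.getColumnStatistics()[1].getNumberOfValues());
    }
  }
}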
use of org.apache.orc.TypeDescription in project hive by apache.
the class OrcInputFormat method generateSplitsInfo.
static List<OrcSplit> generateSplitsInfo(Configuration conf, Context context) throws IOException {
  if (LOG.isInfoEnabled()) {
    LOG.info("ORC pushdown predicate: " + context.sarg);
  }
  boolean useFileIdsConfig = HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_INCLUDE_FILE_ID_IN_SPLITS);
  // Sharing this state assumes splits will succeed or fail to get it together (same FS).
  // We also start with null and only set it to true on the first call, so we would only do
  // the global-disable thing on the first failure w/the API error, not any random failure.
  Ref<Boolean> useFileIds = Ref.from(useFileIdsConfig ? null : false);
  boolean allowSyntheticFileIds = useFileIdsConfig && HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_ALLOW_SYNTHETIC_FILE_ID_IN_SPLITS);
  List<OrcSplit> splits = Lists.newArrayList();
  List<Future<AcidDirInfo>> pathFutures = Lists.newArrayList();
  List<Future<Void>> strategyFutures = Lists.newArrayList();
  final List<Future<List<OrcSplit>>> splitFutures = Lists.newArrayList();
  UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
  // multi-threaded file statuses and split strategy
  Path[] paths = getInputPaths(conf);
  CompletionService<AcidDirInfo> ecs = new ExecutorCompletionService<>(Context.threadPool);
  for (Path dir : paths) {
    FileSystem fs = dir.getFileSystem(conf);
    FileGenerator fileGenerator = new FileGenerator(context, fs, dir, useFileIds, ugi);
    pathFutures.add(ecs.submit(fileGenerator));
  }
  boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
  boolean isSchemaEvolution = HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION);
  TypeDescription readerSchema = OrcInputFormat.getDesiredRowTypeDescr(conf, isTransactionalTableScan, Integer.MAX_VALUE);
  List<OrcProto.Type> readerTypes = null;
  if (readerSchema != null) {
    readerTypes = OrcUtils.getOrcTypes(readerSchema);
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generate splits schema evolution property " + isSchemaEvolution + " reader schema " + (readerSchema == null ? "NULL" : readerSchema.toString()) + " transactional scan property " + isTransactionalTableScan);
  }
  // complete path futures and schedule split generation
  try {
    CombinedCtx combinedCtx = (context.splitStrategyBatchMs > 0) ? new CombinedCtx() : null;
    long maxWaitUs = context.splitStrategyBatchMs * 1000000;
    int resultsLeft = paths.length;
    while (resultsLeft > 0) {
      AcidDirInfo adi = null;
      if (combinedCtx != null && combinedCtx.combined != null) {
        long waitTimeUs = combinedCtx.combineStartUs + maxWaitUs - System.nanoTime();
        if (waitTimeUs >= 0) {
          Future<AcidDirInfo> f = ecs.poll(waitTimeUs, TimeUnit.NANOSECONDS);
          adi = (f == null) ? null : f.get();
        }
      } else {
        adi = ecs.take().get();
      }
      if (adi == null) {
        // We were combining SS-es and the time has expired.
        assert combinedCtx.combined != null;
        scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
        combinedCtx.combined = null;
        continue;
      }
      // We have received new directory information, so make split strategies.
      --resultsLeft;
      // The reason we can get a list of split strategies here is that for the ACID split-update
      // case, when we have a mix of original base files & insert deltas, we will produce two
      // independent split strategies for them. There is a global flag 'isOriginal' that is set
      // on a per-split-strategy basis and it has to be the same for all the files in that strategy.
      List<SplitStrategy<?>> splitStrategies = determineSplitStrategies(combinedCtx, context, adi.fs, adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
      for (SplitStrategy<?> splitStrategy : splitStrategies) {
        if (isDebugEnabled) {
          LOG.debug("Split strategy: {}", splitStrategy);
        }
        // This works purely by magic, because we know which strategy produces which type.
        if (splitStrategy instanceof ETLSplitStrategy) {
          scheduleSplits((ETLSplitStrategy) splitStrategy, context, splitFutures, strategyFutures, splits);
        } else {
          @SuppressWarnings("unchecked")
          List<OrcSplit> readySplits = (List<OrcSplit>) splitStrategy.getSplits();
          splits.addAll(readySplits);
        }
      }
    }
    // Run the last combined strategy, if any.
    if (combinedCtx != null && combinedCtx.combined != null) {
      scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
      combinedCtx.combined = null;
    }
    // complete split futures
    for (Future<Void> ssFuture : strategyFutures) {
      // Make sure we get exceptions strategies might have thrown.
      ssFuture.get();
    }
    // All the split strategies are done, so it must be safe to access splitFutures.
    for (Future<List<OrcSplit>> splitFuture : splitFutures) {
      splits.addAll(splitFuture.get());
    }
  } catch (Exception e) {
    cancelFutures(pathFutures);
    cancelFutures(strategyFutures);
    cancelFutures(splitFutures);
    throw new RuntimeException("ORC split generation failed with exception: " + e.getMessage(), e);
  }
  if (context.cacheStripeDetails) {
    LOG.info("FooterCacheHitRatio: " + context.cacheHitCounter.get() + "/" + context.numFilesCounter.get());
  }
  if (isDebugEnabled) {
    for (OrcSplit split : splits) {
      LOG.debug(split + " projected_columns_uncompressed_size: " + split.getColumnarProjectionSize());
    }
  }
  return splits;
}
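generateSplitsInfo only uses TypeDescription to derive the flat OrcProto.Type list that the split strategies consume. That conversion can be exercised on its own; the sketch below assumes a hypothetical two-column reader schema in place of getDesiredRowTypeDescr's result.

import java.util.List;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class ReaderTypesSketch {
  public static void main(String[] args) {
    // Hypothetical reader schema; the real method builds it from the job configuration.
    TypeDescription readerSchema = TypeDescription.fromString("struct<int1:int,string1:string>");
    // Flatten the schema into the protobuf type list handed to the split strategies.
    List<OrcProto.Type> readerTypes = OrcUtils.getOrcTypes(readerSchema);
    System.out.println("flattened types: " + readerTypes.size());  // root struct + 2 children
    // includeColumns marks which flattened columns a projection touches.
    boolean[] included = OrcUtils.includeColumns("int1", readerSchema);
    System.out.println("int1 included: " + included[1]);
  }
}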
use of org.apache.orc.TypeDescription in project hive by apache.
the class OrcInputFormat method pickStripesViaTranslatedSarg.
public static boolean[] pickStripesViaTranslatedSarg(SearchArgument sarg, OrcFile.WriterVersion writerVersion, List<OrcProto.Type> types, List<StripeStatistics> stripeStats, int stripeCount) {
  LOG.info("Translated ORC pushdown predicate: " + sarg);
  assert sarg != null;
  if (stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
    // only do split pruning if HIVE-8732 has been fixed in the writer
    return null;
  }
  // eliminate stripes that don't satisfy the predicate condition
  List<PredicateLeaf> sargLeaves = sarg.getLeaves();
  int[] filterColumns = RecordReaderImpl.mapTranslatedSargColumns(types, sargLeaves);
  TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);
  SchemaEvolution evolution = new SchemaEvolution(schema, null);
  return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null, evolution);
}
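The method rebuilds a TypeDescription from the protobuf type list so that SchemaEvolution can map the SARG columns. A small round-trip sketch of that conversion is shown below; the struct<x:int,y:string> schema is an assumption, and the SchemaEvolution constructor simply mirrors the (fileSchema, includedColumns) form called in the snippet above.

import java.util.List;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.SchemaEvolution;

public class SargSchemaSketch {
  public static void main(String[] args) {
    // Assumed file schema; pickStripesViaTranslatedSarg only ever sees the
    // equivalent protobuf list, not the original TypeDescription.
    TypeDescription fileSchema = TypeDescription.fromString("struct<x:int,y:string>");
    List<OrcProto.Type> types = OrcUtils.getOrcTypes(fileSchema);
    // Rebuild the TypeDescription starting at the root type (index 0).
    TypeDescription rebuilt = OrcUtils.convertTypeFromProtobuf(types, 0);
    System.out.println(rebuilt.toString());  // struct<x:int,y:string>
    // Same-schema evolution with no column projection, as in the snippet above.
    SchemaEvolution evolution = new SchemaEvolution(rebuilt, null);
    System.out.println("reader schema: " + evolution.getReaderSchema());
  }
}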
use of org.apache.orc.TypeDescription in project hive by apache.
the class OrcInputFormat method createReaderFromFile.
public static RecordReader createReaderFromFile(Reader file, Configuration conf, long offset, long length) throws IOException {
  boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
  if (isTransactionalTableScan) {
    raiseAcidTablesMustBeReadWithAcidReaderException(conf);
  }
  /**
   * Do we have schema on read in the configuration variables?
   */
  TypeDescription schema = getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
  Reader.Options options = new Reader.Options().range(offset, length);
  options.schema(schema);
  boolean isOriginal = isOriginal(file);
  if (schema == null) {
    schema = file.getSchema();
  }
  List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
  options.include(genIncludedColumns(schema, conf));
  setSearchArgument(options, types, conf, isOriginal);
  return file.rowsOptions(options);
}
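createReaderFromFile passes the schema-on-read TypeDescription into Reader.Options before creating the row reader. Outside Hive, the equivalent step with the core org.apache.orc reader API might look like the following sketch; the file path and projected schema are assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class SchemaOnReadSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(new Path("/tmp/stripe-stats-sketch.orc"),  // assumed path
        OrcFile.readerOptions(conf));
    // Schema on read: ask for a specific shape instead of the full file schema.
    TypeDescription readSchema = TypeDescription.fromString("struct<int1:int,string1:string>");
    RecordReader rows = reader.rows(reader.options().schema(readSchema));
    VectorizedRowBatch batch = readSchema.createRowBatch();
    while (rows.nextBatch(batch)) {
      // process batch.size rows per iteration
    }
    rows.close();
  }
}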