Example 1 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class TestVectorizedOrcAcidRowBatchReader method testVectorizedOrcAcidRowBatchReader.

private void testVectorizedOrcAcidRowBatchReader(String deleteEventRegistry) throws Exception {
    List<OrcSplit> splits = getSplits();
    // Mark one of the transactions as an exception to test that invalid transactions
    // are being handled properly.
    // Exclude transaction 5
    conf.set(ValidTxnList.VALID_TXNS_KEY, "14:1:1:5");
    VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL);
    if (deleteEventRegistry.equals(ColumnizedDeleteEventRegistry.class.getName())) {
        assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof ColumnizedDeleteEventRegistry);
    }
    if (deleteEventRegistry.equals(SortMergedDeleteEventRegistry.class.getName())) {
        assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof SortMergedDeleteEventRegistry);
    }
    TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
    VectorizedRowBatch vectorizedRowBatch = schema.createRowBatch();
    // Set the data column count to 1.
    vectorizedRowBatch.setPartitionInfo(1, 0);
    long previousPayload = Long.MIN_VALUE;
    while (vectorizedReader.next(null, vectorizedRowBatch)) {
        assertTrue(vectorizedRowBatch.selectedInUse);
        LongColumnVector col = (LongColumnVector) vectorizedRowBatch.cols[0];
        for (int i = 0; i < vectorizedRowBatch.size; ++i) {
            int idx = vectorizedRowBatch.selected[i];
            long payload = col.vector[idx];
            long otid = (payload / NUM_ROWID_PER_OTID) + 1;
            long rowId = payload % NUM_ROWID_PER_OTID;
            assertFalse(rowId % 2 == 0 || rowId % 3 == 0);
            // Check that txn#5 has been excluded.
            assertTrue(otid != 5);
            // Check that the data is in sorted order.
            assertTrue(payload > previousPayload);
            previousPayload = payload;
        }
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) TypeDescription(org.apache.orc.TypeDescription) ColumnizedDeleteEventRegistry(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) SortMergedDeleteEventRegistry(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.SortMergedDeleteEventRegistry)
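
The pattern this test exercises, a TypeDescription producing a VectorizedRowBatch whose rows are read back through the selected[] indirection, can be reduced to a standalone sketch. The struct<payload:bigint> schema and the selected row indices below are illustrative assumptions, not part of the Hive test.

import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

public class RowBatchSketch {
    public static void main(String[] args) {
        // Hypothetical one-column schema standing in for the ACID row layout.
        TypeDescription schema = TypeDescription.fromString("struct<payload:bigint>");
        VectorizedRowBatch batch = schema.createRowBatch();
        LongColumnVector payload = (LongColumnVector) batch.cols[0];
        for (int r = 0; r < 4; r++) {
            // Fill four rows of data.
            payload.vector[r] = 100L * r;
        }
        // Simulate a reader that kept only rows 1 and 3.
        batch.size = 2;
        batch.selectedInUse = true;
        batch.selected[0] = 1;
        batch.selected[1] = 3;
        for (int i = 0; i < batch.size; i++) {
            // Indirect through selected[], as the test loop does.
            int idx = batch.selected[i];
            System.out.println("row " + idx + " payload=" + payload.vector[idx]);
        }
    }
}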

Example 2 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class TestOrcFile method testStripeLevelStats.

@Test
public void testStripeLevelStats() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000).batchSize(1000));
    for (int i = 0; i < 11000; i++) {
        if (i >= 5000) {
            if (i >= 10000) {
                writer.addRow(new InnerStruct(3, "three"));
            } else {
                writer.addRow(new InnerStruct(2, "two"));
            }
        } else {
            writer.addRow(new InnerStruct(1, "one"));
        }
    }
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    TypeDescription schema = writer.getSchema();
    assertEquals(2, schema.getMaximumId());
    boolean[] expected = new boolean[] { false, true, false };
    boolean[] included = OrcUtils.includeColumns("int1", schema);
    assertEquals(true, Arrays.equals(expected, included));
    List<StripeStatistics> stats = reader.getStripeStatistics();
    int numStripes = stats.size();
    assertEquals(3, numStripes);
    StripeStatistics ss1 = stats.get(0);
    StripeStatistics ss2 = stats.get(1);
    StripeStatistics ss3 = stats.get(2);
    assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues());
    assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues());
    assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues());
    assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues());
    assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues());
    assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues());
    assertEquals(1, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getMinimum());
    assertEquals(2, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getMinimum());
    assertEquals(3, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getMinimum());
    assertEquals(1, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getMaximum());
    assertEquals(2, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getMaximum());
    assertEquals(3, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getMaximum());
    assertEquals(5000, ((IntegerColumnStatistics) ss1.getColumnStatistics()[1]).getSum());
    assertEquals(10000, ((IntegerColumnStatistics) ss2.getColumnStatistics()[1]).getSum());
    assertEquals(3000, ((IntegerColumnStatistics) ss3.getColumnStatistics()[1]).getSum());
    assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues());
    assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues());
    assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues());
    assertEquals("one", ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getMinimum());
    assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMinimum());
    assertEquals("three", ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getMinimum());
    assertEquals("one", ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getMaximum());
    assertEquals("two", ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getMaximum());
    assertEquals("three", ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getMaximum());
    assertEquals(15000, ((StringColumnStatistics) ss1.getColumnStatistics()[2]).getSum());
    assertEquals(15000, ((StringColumnStatistics) ss2.getColumnStatistics()[2]).getSum());
    assertEquals(5000, ((StringColumnStatistics) ss3.getColumnStatistics()[2]).getSum());
    RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows();
    OrcProto.RowIndex[] index = recordReader.readRowIndex(0, null, null).getRowGroupIndex();
    assertEquals(3, index.length);
    List<OrcProto.RowIndexEntry> items = index[1].getEntryList();
    assertEquals(1, items.size());
    assertEquals(3, items.get(0).getPositionsCount());
    assertEquals(0, items.get(0).getPositions(0));
    assertEquals(0, items.get(0).getPositions(1));
    assertEquals(0, items.get(0).getPositions(2));
    assertEquals(1, items.get(0).getStatistics().getIntStatistics().getMinimum());
    index = recordReader.readRowIndex(1, null, null).getRowGroupIndex();
    assertEquals(3, index.length);
    items = index[1].getEntryList();
    assertEquals(2, items.get(0).getStatistics().getIntStatistics().getMaximum());
}
Also used : HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) StripeStatistics(org.apache.orc.StripeStatistics) TypeDescription(org.apache.orc.TypeDescription) Test(org.junit.Test)
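
The schema checks above, getMaximumId() and OrcUtils.includeColumns, can be tried against a hand-built schema. The struct<int1:int,string1:string> layout below is an assumption that mirrors what InnerStruct appears to contain; the values in the comments follow from the assertions in the test.

import java.util.Arrays;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class SchemaIdsSketch {
    public static void main(String[] args) {
        // Hand-written stand-in for the schema the writer derives from InnerStruct.
        TypeDescription schema = TypeDescription.fromString("struct<int1:int,string1:string>");
        // Type ids: 0 = root struct, 1 = int1, 2 = string1.
        System.out.println("maximum id = " + schema.getMaximumId());  // 2
        // Project only int1; the flag array is indexed by type id.
        boolean[] included = OrcUtils.includeColumns("int1", schema);
        System.out.println(Arrays.toString(included));                // [false, true, false]
    }
}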

Example 3 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class OrcInputFormat method generateSplitsInfo.

static List<OrcSplit> generateSplitsInfo(Configuration conf, Context context) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("ORC pushdown predicate: " + context.sarg);
    }
    boolean useFileIdsConfig = HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_INCLUDE_FILE_ID_IN_SPLITS);
    // Sharing this state assumes splits will succeed or fail to get it together (same FS).
    // We also start with null and only set it to true on the first call, so we would only do
    // the global-disable thing on the first failure w/the API error, not any random failure.
    Ref<Boolean> useFileIds = Ref.from(useFileIdsConfig ? null : false);
    boolean allowSyntheticFileIds = useFileIdsConfig && HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_ALLOW_SYNTHETIC_FILE_ID_IN_SPLITS);
    List<OrcSplit> splits = Lists.newArrayList();
    List<Future<AcidDirInfo>> pathFutures = Lists.newArrayList();
    List<Future<Void>> strategyFutures = Lists.newArrayList();
    final List<Future<List<OrcSplit>>> splitFutures = Lists.newArrayList();
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    // multi-threaded file statuses and split strategy
    Path[] paths = getInputPaths(conf);
    CompletionService<AcidDirInfo> ecs = new ExecutorCompletionService<>(Context.threadPool);
    for (Path dir : paths) {
        FileSystem fs = dir.getFileSystem(conf);
        FileGenerator fileGenerator = new FileGenerator(context, fs, dir, useFileIds, ugi);
        pathFutures.add(ecs.submit(fileGenerator));
    }
    boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
    boolean isSchemaEvolution = HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION);
    TypeDescription readerSchema = OrcInputFormat.getDesiredRowTypeDescr(conf, isTransactionalTableScan, Integer.MAX_VALUE);
    List<OrcProto.Type> readerTypes = null;
    if (readerSchema != null) {
        readerTypes = OrcUtils.getOrcTypes(readerSchema);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generate splits schema evolution property " + isSchemaEvolution + " reader schema " + (readerSchema == null ? "NULL" : readerSchema.toString()) + " transactional scan property " + isTransactionalTableScan);
    }
    // complete path futures and schedule split generation
    try {
        CombinedCtx combinedCtx = (context.splitStrategyBatchMs > 0) ? new CombinedCtx() : null;
        long maxWaitUs = context.splitStrategyBatchMs * 1000000;
        int resultsLeft = paths.length;
        while (resultsLeft > 0) {
            AcidDirInfo adi = null;
            if (combinedCtx != null && combinedCtx.combined != null) {
                long waitTimeUs = combinedCtx.combineStartUs + maxWaitUs - System.nanoTime();
                if (waitTimeUs >= 0) {
                    Future<AcidDirInfo> f = ecs.poll(waitTimeUs, TimeUnit.NANOSECONDS);
                    adi = (f == null) ? null : f.get();
                }
            } else {
                adi = ecs.take().get();
            }
            if (adi == null) {
                // We were combining SS-es and the time has expired.
                assert combinedCtx.combined != null;
                scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
                combinedCtx.combined = null;
                continue;
            }
            // We have received a new directory information, make split strategies.
            --resultsLeft;
            // The reason why we can get a list of split strategies here is because for ACID split-update
            // case when we have a mix of original base files & insert deltas, we will produce two
            // independent split strategies for them. There is a global flag 'isOriginal' that is set
            // on a per split strategy basis and it has to be the same for all the files in that strategy.
            List<SplitStrategy<?>> splitStrategies = determineSplitStrategies(combinedCtx, context, adi.fs, adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
            for (SplitStrategy<?> splitStrategy : splitStrategies) {
                if (isDebugEnabled) {
                    LOG.debug("Split strategy: {}", splitStrategy);
                }
                // This works purely by magic, because we know which strategy produces which type.
                if (splitStrategy instanceof ETLSplitStrategy) {
                    scheduleSplits((ETLSplitStrategy) splitStrategy, context, splitFutures, strategyFutures, splits);
                } else {
                    @SuppressWarnings("unchecked") List<OrcSplit> readySplits = (List<OrcSplit>) splitStrategy.getSplits();
                    splits.addAll(readySplits);
                }
            }
        }
        // Run the last combined strategy, if any.
        if (combinedCtx != null && combinedCtx.combined != null) {
            scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
            combinedCtx.combined = null;
        }
        // complete split futures
        for (Future<Void> ssFuture : strategyFutures) {
            // Make sure we get exceptions strategies might have thrown.
            ssFuture.get();
        }
        // All the split strategies are done, so it must be safe to access splitFutures.
        for (Future<List<OrcSplit>> splitFuture : splitFutures) {
            splits.addAll(splitFuture.get());
        }
    } catch (Exception e) {
        cancelFutures(pathFutures);
        cancelFutures(strategyFutures);
        cancelFutures(splitFutures);
        throw new RuntimeException("ORC split generation failed with exception: " + e.getMessage(), e);
    }
    if (context.cacheStripeDetails) {
        LOG.info("FooterCacheHitRatio: " + context.cacheHitCounter.get() + "/" + context.numFilesCounter.get());
    }
    if (isDebugEnabled) {
        for (OrcSplit split : splits) {
            LOG.debug(split + " projected_columns_uncompressed_size: " + split.getColumnarProjectionSize());
        }
    }
    return splits;
}
Also used : ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) ValidReadTxnList(org.apache.hadoop.hive.common.ValidReadTxnList) ArrayList(java.util.ArrayList) ValidTxnList(org.apache.hadoop.hive.common.ValidTxnList) List(java.util.List) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Future(java.util.concurrent.Future)
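
A standalone sketch of the reader-schema step: getDesiredRowTypeDescr is internal to OrcInputFormat, so a hand-written TypeDescription stands in for it here, and OrcUtils.getOrcTypes flattens it into the OrcProto.Type list that the split strategies receive. The schema and the printed values are illustrative only.

import java.util.List;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class ReaderTypesSketch {
    public static void main(String[] args) {
        // Hypothetical reader schema; in generateSplitsInfo it comes from the configuration.
        TypeDescription readerSchema =
            TypeDescription.fromString("struct<id:bigint,name:string,tags:array<string>>");
        List<OrcProto.Type> readerTypes = OrcUtils.getOrcTypes(readerSchema);
        // One entry per type id: the root struct, its three fields, and the array element.
        System.out.println("flattened types: " + readerTypes.size());     // 5
        System.out.println("root kind: " + readerTypes.get(0).getKind()); // STRUCT
    }
}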

Example 4 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class OrcInputFormat method pickStripesViaTranslatedSarg.

public static boolean[] pickStripesViaTranslatedSarg(SearchArgument sarg, OrcFile.WriterVersion writerVersion, List<OrcProto.Type> types, List<StripeStatistics> stripeStats, int stripeCount) {
    LOG.info("Translated ORC pushdown predicate: " + sarg);
    assert sarg != null;
    if (stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
        // only do split pruning if HIVE-8732 has been fixed in the writer
        return null;
    }
    // eliminate stripes that don't satisfy the predicate condition
    List<PredicateLeaf> sargLeaves = sarg.getLeaves();
    int[] filterColumns = RecordReaderImpl.mapTranslatedSargColumns(types, sargLeaves);
    TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);
    SchemaEvolution evolution = new SchemaEvolution(schema, null);
    return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null, evolution);
}
Also used : PredicateLeaf(org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf) TypeDescription(org.apache.orc.TypeDescription) SchemaEvolution(org.apache.orc.impl.SchemaEvolution)
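
The translation step can be shown in isolation: OrcUtils.getOrcTypes flattens a schema, OrcUtils.convertTypeFromProtobuf rebuilds it from type id 0, and a SchemaEvolution is constructed with no separate reader schema, mirroring the call in pickStripesViaTranslatedSarg. The example schema below is an assumption; the round trip should print the same struct string twice.

import java.util.List;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.SchemaEvolution;

public class SargSchemaSketch {
    public static void main(String[] args) {
        // Hypothetical file schema; in the method it is rebuilt from the footer's type list.
        TypeDescription fileSchema = TypeDescription.fromString("struct<x:int,y:string>");
        List<OrcProto.Type> types = OrcUtils.getOrcTypes(fileSchema);
        TypeDescription rebuilt = OrcUtils.convertTypeFromProtobuf(types, 0);
        System.out.println(rebuilt.toString());      // struct<x:int,y:string>
        // No reader schema supplied, so evolution just mirrors the file schema.
        SchemaEvolution evolution = new SchemaEvolution(rebuilt, null);
        System.out.println(evolution.getReaderSchema().toString());
    }
}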

Example 5 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class OrcInputFormat method createReaderFromFile.

public static RecordReader createReaderFromFile(Reader file, Configuration conf, long offset, long length) throws IOException {
    boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
    if (isTransactionalTableScan) {
        raiseAcidTablesMustBeReadWithAcidReaderException(conf);
    }
    /**
     * Do we have schema on read in the configuration variables?
     */
    TypeDescription schema = getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
    Reader.Options options = new Reader.Options().range(offset, length);
    options.schema(schema);
    boolean isOriginal = isOriginal(file);
    if (schema == null) {
        schema = file.getSchema();
    }
    List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
    options.include(genIncludedColumns(schema, conf));
    setSearchArgument(options, types, conf, isOriginal);
    return file.rowsOptions(options);
}
Also used : TypeDescription(org.apache.orc.TypeDescription) StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader) BatchToRowReader(org.apache.hadoop.hive.ql.io.BatchToRowReader)
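
A minimal sketch of the schema-on-read fallback: when no desired TypeDescription is configured, the file's own schema is used, and a column list becomes the include array that the read options consume. The file schema and projected column below are hypothetical, and OrcUtils.includeColumns stands in for the internal genIncludedColumns helper to show the same shape.

import java.util.Arrays;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class SchemaOnReadSketch {
    public static void main(String[] args) {
        // Nothing configured for schema on read.
        TypeDescription desired = null;
        // Hypothetical schema, standing in for file.getSchema().
        TypeDescription fileSchema = TypeDescription.fromString("struct<id:bigint,name:string>");
        TypeDescription schema = (desired != null) ? desired : fileSchema;
        // Project only "name"; type ids are 0 = root, 1 = id, 2 = name.
        boolean[] include = OrcUtils.includeColumns("name", schema);
        System.out.println(Arrays.toString(include));  // [false, false, true]
    }
}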

Aggregations

TypeDescription (org.apache.orc.TypeDescription): 24
ArrayList (java.util.ArrayList): 6
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 5
Test (org.junit.Test): 5
Path (org.apache.hadoop.fs.Path): 4
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 4
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 4
ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector): 4
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 4
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 4
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 4
BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector): 4
BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector): 4
ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector): 4
DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector): 4
FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector): 4
HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector): 4
StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector): 3
IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector): 3
LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector): 3