Example 26 with VectorizedRowBatchCtx

Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.

From class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventOriginalFiltering2.

private void testDeleteEventOriginalFiltering2() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
    // Need to use a bigger row than DummyRow for the writer to flush the stripes
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    Properties properties = new Properties();
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(properties, conf);
    writerOptions.inspector(bigOriginalRowInspector).stripeSize(1).batchSize(1);
    String originalFile = "000000_0";
    Path originalFilePath = new Path(root, originalFile);
    byte[] data = new byte[1000];
    Writer writer = OrcFile.createWriter(originalFilePath, writerOptions);
    writer.addRow(new BigOriginalRow(data));
    writer.addRow(new BigOriginalRow(data));
    writer.addRow(new BigOriginalRow(data));
    writer.close();
    Reader reader = OrcFile.createReader(originalFilePath, OrcFile.readerOptions(conf));
    List<StripeInformation> stripes = reader.getStripes();
    // Make sure 3 stripes are created
    assertEquals(3, stripes.size());
    FileStatus fileStatus = fs.getFileStatus(originalFilePath);
    long fileLength = fileStatus.getLen();
    // Set vector mode to true in the map work so that we can generate the syntheticProps
    MapWork mapWork = new MapWork();
    mapWork.setVectorMode(true);
    VectorizedRowBatchCtx vrbContext = new VectorizedRowBatchCtx();
    mapWork.setVectorizedRowBatchCtx(vrbContext);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");
    Utilities.setMapWork(conf, mapWork);
    OrcSplit.OffsetAndBucketProperty syntheticProps = VectorizedOrcAcidRowBatchReader.computeOffsetAndBucket(fileStatus, root, true, true, conf);
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(0);
    int bucketProperty = BucketCodec.V1.encode(options);
    // 1. Splits within a stripe
    // A split that's completely within the 2nd stripe
    StripeInformation stripe = stripes.get(1);
    OrcSplit split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2), new RecordIdentifier(0, bucketProperty, 1), filterOn);
    // A split that's completely within the last stripe
    stripe = stripes.get(2);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset() + 50, stripe.getLength() - 100, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 3), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // 2. Splits starting at a stripe boundary
    // A split that starts where the 1st stripe starts and ends before the 1st stripe ends
    stripe = stripes.get(0);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() - 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the 1st stripe
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 0), filterOn);
    // A split that starts where the 2nd stripe starts and ends after the 2nd stripe ends
    stripe = stripes.get(1);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset(), stripe.getLength() + 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // 3. Splits ending at a stripe boundary
    // A split that starts before the last stripe starts and ends at the last stripe boundary
    stripe = stripes.get(2);
    split = new OrcSplit(originalFilePath, null, stripe.getOffset() - 50, stripe.getLength() + 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last stripe
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 2), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // A split that starts after the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset() + 50, reader.getContentLength() - 50, new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for the last 2 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 1), new RecordIdentifier(0, bucketProperty, 2), filterOn);
    // A split that starts where the 1st stripe starts and ends where the last stripe ends
    split = new OrcSplit(originalFilePath, null, stripes.get(0).getOffset(), reader.getContentLength(), new String[] { "localhost" }, null, true, true, getDeltaMetaDataWithBucketFile(0), fileLength, fileLength, root, syntheticProps);
    // The key interval for all 3 stripes
    validateKeyInterval(split, new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(0, bucketProperty, 2), filterOn);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Properties(java.util.Properties) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) StripeInformation(org.apache.orc.StripeInformation)
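
A side note on the expected intervals above. With stripeSize(1) and batchSize(1), each of the 3 rows lands in its own stripe, and the synthetic ROW__IDs generated for an original file use writeId 0 and number rows continuously from 0, so stripe i holds exactly rowId i. Reading the validateKeyInterval expectations: a split that fully covers stripes a..b gets the interval (0, bucketProperty, a) .. (0, bucketProperty, b), while a split that covers no complete stripe (the first two cases) gets an interval whose minKey lies past its maxKey, i.e. an effectively empty interval that excludes every delete event. A minimal sketch of that expectation (the helper name is hypothetical, not part of the test):

private OrcRawRecordMerger.KeyInterval expectedOriginalKeyInterval(int firstStripeIdx, int lastStripeIdx, int bucketProperty) {
    // Original-file reads synthesize ROW__IDs with writeId 0; with one row per
    // stripe, stripe i contains exactly rowId i, so a split fully covering
    // stripes [firstStripeIdx, lastStripeIdx] yields this interval:
    return new OrcRawRecordMerger.KeyInterval(
        new RecordIdentifier(0, bucketProperty, firstStripeIdx),
        new RecordIdentifier(0, bucketProperty, lastStripeIdx));
}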

Example 27 with VectorizedRowBatchCtx

Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.

From class TestVectorizedOrcAcidRowBatchReader, method testDeleteEventFiltering2.

private void testDeleteEventFiltering2() throws Exception {
    boolean filterOn = HiveConf.getBoolVar(conf, HiveConf.ConfVars.FILTER_DELETE_EVENTS);
    boolean skipKeyIdx = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVETESTMODEACIDKEYIDXSKIP);
    int bucket = 1;
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(true).minimumWriteId(10000002).maximumWriteId(10000002).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    int bucketProperty = BucketCodec.V1.encode(options);
    // Create data that looks like a compacted base that includes some data
    // from 'original' files and some from a native ACID write
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.insert(0, new DummyRow(1, 0, 0, bucket));
    updater.insert(0, new DummyRow(1, 1, 0, bucket));
    updater.insert(0, new DummyRow(2, 2, 0, bucket));
    updater.insert(10000001, new DummyRow(3, 0, 10000001, bucket));
    updater.close(false);
    // delete 3rd row
    options.writingBase(false).minimumWriteId(10000004).maximumWriteId(10000004);
    updater = new OrcRecordUpdater(root, options);
    updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 0, 0, bucket));
    // Hypothetically this matches something in delta_10000003_10000003
    // (which does not exist here)
    updater.delete(options.getMinimumWriteId(), new DummyRow(-1, 5, 10000003, bucket));
    updater.close(false);
    conf.set(ValidTxnList.VALID_TXNS_KEY, new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());
    // The HWM is not important - just make sure the deltas created above are
    // read as if committed
    conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:10000005:" + Long.MAX_VALUE + "::");
    List<OrcInputFormat.SplitStrategy<?>> splitStrategies = getSplitStrategies();
    assertEquals(1, splitStrategies.size());
    List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy) splitStrategies.get(0)).getSplits();
    assertEquals(1, splits.size());
    assertEquals(root.toUri().toString() + File.separator + "base_10000002/bucket_00001", splits.get(0).getPath().toUri().toString());
    assertFalse(splits.get(0).isOriginal());
    VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx());
    ColumnizedDeleteEventRegistry deleteEventRegistry = (ColumnizedDeleteEventRegistry) vectorizedReader.getDeleteEventRegistry();
    assertEquals("number of delete events for stripe 1", filterOn ? 1 : 2, deleteEventRegistry.size());
    OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
    SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
    if (filterOn) {
        if (skipKeyIdx) {
            // If the key index is not present, the min/max key interval is derived from stripe stats instead
            assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(10000001, bucketProperty, 2)), keyInterval);
        } else {
            assertEquals(new OrcRawRecordMerger.KeyInterval(new RecordIdentifier(0, bucketProperty, 0), new RecordIdentifier(10000001, bucketProperty, 0)), keyInterval);
        }
        // The key point is that leaf-5 is (rowId <= 2) even though maxKey has
        // rowId 0. See VectorizedOrcAcidRowBatchReader.findMinMaxKeys for details.
        assertEquals("leaf-0 = (LESS_THAN originalTransaction 0)," + " leaf-1 = (LESS_THAN bucket 536936448)," + " leaf-2 = (LESS_THAN rowId 0)," + " leaf-3 = (LESS_THAN_EQUALS originalTransaction 10000001)," + " leaf-4 = (LESS_THAN_EQUALS bucket 536936448)," + " leaf-5 = (LESS_THAN_EQUALS rowId 2)," + " expr = (and (not leaf-0) (not leaf-1) " + "(not leaf-2) leaf-3 leaf-4 leaf-5)", ((SearchArgumentImpl) sarg).toOldString());
    } else {
        assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
        assertNull(sarg);
    }
}
Also used : BitSet(java.util.BitSet) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) ValidReadTxnList(org.apache.hadoop.hive.common.ValidReadTxnList) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) ColumnizedDeleteEventRegistry(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader.ColumnizedDeleteEventRegistry)
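
The bucket value 536936448 in leaf-1 and leaf-4 of the SARG above is the BucketCodec.V1 encoding of bucket 1. A quick sketch of the arithmetic, assuming the V1 bit layout (a 3-bit codec version in the top bits, the bucket id starting at bit 16, and the statement id in the low bits):

// Hedged sketch: the field widths are an assumption about BucketCodec.V1,
// but the arithmetic reproduces the constant seen in the SARG above.
int version = 1;      // BucketCodec.V1
int bucketId = 1;     // matches 'int bucket = 1' in the test
int statementId = 0;
int bucketProperty = (version << 29) | (bucketId << 16) | statementId;
// (1 << 29) + (1 << 16) = 536870912 + 65536 = 536936448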

Example 28 with VectorizedRowBatchCtx

Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.

From class TestVectorizedOrcAcidRowBatchReader, method validateKeyInterval.

private void validateKeyInterval(OrcSplit split, RecordIdentifier lowKey, RecordIdentifier highKey, boolean filterOn) throws Exception {
    VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(split, conf, Reporter.NULL, new VectorizedRowBatchCtx());
    OrcRawRecordMerger.KeyInterval keyInterval = vectorizedReader.getKeyInterval();
    SearchArgument sarg = vectorizedReader.getDeleteEventSarg();
    if (filterOn) {
        assertEquals(new OrcRawRecordMerger.KeyInterval(lowKey, highKey), keyInterval);
    } else {
        assertEquals(new OrcRawRecordMerger.KeyInterval(null, null), keyInterval);
        assertNull(sarg);
    }
}
Also used : VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument)

Example 29 with VectorizedRowBatchCtx

Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.

From class TestVectorIndex, method doVectorCastTest.

private boolean doVectorCastTest(TypeInfo typeInfo, List<String> columns, String[] columnNames, TypeInfo[] typeInfos, DataTypePhysicalVariation[] dataTypePhysicalVariations, List<ExprNodeDesc> children, GenericUDF udf, ExprNodeGenericFuncDesc exprDesc, IndexTestMode indexTestMode, VectorRandomBatchSource batchSource, ObjectInspector objectInspector, TypeInfo outputTypeInfo, Object[] resultObjects) throws Exception {
    HiveConf hiveConf = new HiveConf();
    if (indexTestMode == IndexTestMode.ADAPTOR) {
        hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_TEST_VECTOR_ADAPTOR_OVERRIDE, true);
    }
    VectorizationContext vectorizationContext = new VectorizationContext("name", columns, Arrays.asList(typeInfos), Arrays.asList(dataTypePhysicalVariations), hiveConf);
    VectorExpression vectorExpression = vectorizationContext.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION);
    vectorExpression.transientInit(hiveConf);
    if (indexTestMode == IndexTestMode.VECTOR_EXPRESSION && vectorExpression instanceof VectorUDFAdaptor) {
        System.out.println("*NO NATIVE VECTOR EXPRESSION* typeInfo " + typeInfo.toString() + " indexTestMode " + indexTestMode + " vectorExpression " + vectorExpression.toString());
    }
    System.out.println("*VECTOR EXPRESSION* " + vectorExpression.getClass().getSimpleName());
    /*
    System.out.println(
        "*DEBUG* typeInfo " + typeInfo.toString() +
        " indexTestMode " + indexTestMode +
        " vectorExpression " + vectorExpression.toString());
    */
    VectorRandomRowSource rowSource = batchSource.getRowSource();
    VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
        columnNames,
        rowSource.typeInfos(),
        rowSource.dataTypePhysicalVariations(),
        /* dataColumnNums */ null,
        /* partitionColumnCount */ 0,
        /* virtualColumnCount */ 0,
        /* neededVirtualColumns */ null,
        vectorizationContext.getScratchColumnTypeNames(),
        vectorizationContext.getScratchDataTypePhysicalVariations());
    VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
    VectorExtractRow resultVectorExtractRow = new VectorExtractRow();
    resultVectorExtractRow.init(new TypeInfo[] { outputTypeInfo }, new int[] { vectorExpression.getOutputColumnNum() });
    Object[] scratchRow = new Object[1];
    batchSource.resetBatchIteration();
    int rowIndex = 0;
    while (true) {
        if (!batchSource.fillNextBatch(batch)) {
            break;
        }
        vectorExpression.evaluate(batch);
        extractResultObjects(batch, rowIndex, resultVectorExtractRow, scratchRow, objectInspector, resultObjects);
        rowIndex += batch.size;
    }
    return true;
}
Also used : VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) HiveConf(org.apache.hadoop.hive.conf.HiveConf) VectorizationContext(org.apache.hadoop.hive.ql.exec.vector.VectorizationContext) VectorUDFAdaptor(org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor) VectorExtractRow(org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow) VectorRandomRowSource(org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource)

Example 30 with VectorizedRowBatchCtx

Use of org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx in project hive by apache.

From class TestVectorCastStatement, method doVectorCastTest.

private boolean doVectorCastTest(TypeInfo typeInfo, TypeInfo targetTypeInfo, List<String> columns, String[] columnNames, TypeInfo[] typeInfos, DataTypePhysicalVariation[] dataTypePhysicalVariations, List<ExprNodeDesc> children, CastStmtTestMode castStmtTestMode, VectorRandomBatchSource batchSource, Object[] resultObjects) throws Exception {
    GenericUDF udf;
    try {
        udf = VectorizationContext.getGenericUDFForCast(targetTypeInfo);
    } catch (HiveException e) {
        return false;
    }
    ExprNodeGenericFuncDesc exprDesc = new ExprNodeGenericFuncDesc(targetTypeInfo, udf, children);
    HiveConf hiveConf = new HiveConf();
    if (castStmtTestMode == CastStmtTestMode.ADAPTOR) {
        hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_TEST_VECTOR_ADAPTOR_OVERRIDE, true);
    }
    VectorizationContext vectorizationContext = new VectorizationContext("name", columns, Arrays.asList(typeInfos), Arrays.asList(dataTypePhysicalVariations), hiveConf);
    VectorExpression vectorExpression = vectorizationContext.getVectorExpression(exprDesc);
    vectorExpression.transientInit(hiveConf);
    if (castStmtTestMode == CastStmtTestMode.VECTOR_EXPRESSION && vectorExpression instanceof VectorUDFAdaptor) {
        System.out.println("*NO NATIVE VECTOR EXPRESSION* typeInfo " + typeInfo.toString() + " castStmtTestMode " + castStmtTestMode + " vectorExpression " + vectorExpression.toString());
    }
    // System.out.println("*VECTOR EXPRESSION* " + vectorExpression.getClass().getSimpleName());
    /*
    System.out.println(
        "*DEBUG* typeInfo " + typeInfo.toString() +
        " targetTypeInfo " + targetTypeInfo +
        " castStmtTestMode " + castStmtTestMode +
        " vectorExpression " + vectorExpression.toString());
    */
    VectorRandomRowSource rowSource = batchSource.getRowSource();
    VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(
        columnNames,
        rowSource.typeInfos(),
        rowSource.dataTypePhysicalVariations(),
        /* dataColumnNums */ null,
        /* partitionColumnCount */ 0,
        /* virtualColumnCount */ 0,
        /* neededVirtualColumns */ null,
        vectorizationContext.getScratchColumnTypeNames(),
        vectorizationContext.getScratchDataTypePhysicalVariations());
    VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();
    VectorExtractRow resultVectorExtractRow = new VectorExtractRow();
    resultVectorExtractRow.init(new TypeInfo[] { targetTypeInfo }, new int[] { vectorExpression.getOutputColumnNum() });
    Object[] scratchRow = new Object[1];
    batchSource.resetBatchIteration();
    int rowIndex = 0;
    while (true) {
        if (!batchSource.fillNextBatch(batch)) {
            break;
        }
        vectorExpression.evaluate(batch);
        extractResultObjects(batch, rowIndex, resultVectorExtractRow, scratchRow, resultObjects);
        rowIndex += batch.size;
    }
    return true;
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) VectorizationContext(org.apache.hadoop.hive.ql.exec.vector.VectorizationContext) VectorUDFAdaptor(org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor) VectorExtractRow(org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) HiveConf(org.apache.hadoop.hive.conf.HiveConf) VectorExpression(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression) VectorRandomRowSource(org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource)
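
Examples 29 and 30 end with the same drain loop: fill a batch, evaluate the vectorized expression in place, extract the results, advance. Distilled, the shared pattern looks like this (a sketch only; it reuses the surrounding test's objects rather than standing alone):

batchSource.resetBatchIteration();
int rowIndex = 0;
while (batchSource.fillNextBatch(batch)) {      // false once the rows are exhausted
    vectorExpression.evaluate(batch);           // run the vectorized expression in place
    extractResultObjects(batch, rowIndex,       // copy this batch's outputs into
        resultVectorExtractRow, scratchRow,     // resultObjects, starting at rowIndex
        resultObjects);
    rowIndex += batch.size;                     // batch.size = number of rows in this batch
}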

Aggregations

VectorizedRowBatchCtx (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx): 34 uses
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 14 uses
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 12 uses
VectorExtractRow (org.apache.hadoop.hive.ql.exec.vector.VectorExtractRow): 12 uses
VectorRandomRowSource (org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource): 12 uses
VectorizationContext (org.apache.hadoop.hive.ql.exec.vector.VectorizationContext): 12 uses
VectorUDFAdaptor (org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor): 11 uses
DataTypePhysicalVariation (org.apache.hadoop.hive.common.type.DataTypePhysicalVariation): 10 uses
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 10 uses
ArrayList (java.util.ArrayList): 9 uses
VectorExpression (org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression): 8 uses
DecimalTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo): 8 uses
PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo): 8 uses
GenericUDF (org.apache.hadoop.hive.ql.udf.generic.GenericUDF): 7 uses
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 6 uses
VectorRandomBatchSource (org.apache.hadoop.hive.ql.exec.vector.VectorRandomBatchSource): 5 uses
GenerationSpec (org.apache.hadoop.hive.ql.exec.vector.VectorRandomRowSource.GenerationSpec): 5 uses
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 5 uses
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 5 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 5 uses