
Example 1 with LazyBinary

Use of org.apache.hadoop.hive.serde2.lazy.LazyBinary in project hive by apache.

From the class Vectorizer, method canSpecializeMapJoin.

private boolean canSpecializeMapJoin(Operator<? extends OperatorDesc> op, MapJoinDesc desc, boolean isTezOrSpark, VectorizationContext vContext, VectorMapJoinInfo vectorMapJoinInfo) throws HiveException {
    Preconditions.checkState(op instanceof MapJoinOperator);
    // Allocate a VectorMapJoinDesc initially with implementation type NONE so EXPLAIN
    // can report this operator was vectorized, but not native, along with the conditions.
    VectorMapJoinDesc vectorDesc = new VectorMapJoinDesc();
    desc.setVectorDesc(vectorDesc);
    boolean isVectorizationMapJoinNativeEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED);
    String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
    boolean oneMapJoinCondition = (desc.getConds().length == 1);
    boolean hasNullSafes = onExpressionHasNullSafes(desc);
    byte posBigTable = (byte) desc.getPosBigTable();
    // Since we want to display all the met and not-met conditions in EXPLAIN, we determine
    // all the information first...
    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
    VectorExpression[] allBigTableKeyExpressions = vContext.getVectorExpressions(keyDesc);
    final int allBigTableKeyExpressionsLength = allBigTableKeyExpressions.length;
    // Assume.
    boolean supportsKeyTypes = true;
    HashSet<String> notSupportedKeyTypes = new HashSet<String>();
    // Since a key expression can be a calculation and the key will go into a scratch column,
    // we need the mapping and type information.
    int[] bigTableKeyColumnMap = new int[allBigTableKeyExpressionsLength];
    String[] bigTableKeyColumnNames = new String[allBigTableKeyExpressionsLength];
    TypeInfo[] bigTableKeyTypeInfos = new TypeInfo[allBigTableKeyExpressionsLength];
    ArrayList<VectorExpression> bigTableKeyExpressionsList = new ArrayList<VectorExpression>();
    VectorExpression[] bigTableKeyExpressions;
    for (int i = 0; i < allBigTableKeyExpressionsLength; i++) {
        VectorExpression ve = allBigTableKeyExpressions[i];
        if (!IdentityExpression.isColumnOnly(ve)) {
            bigTableKeyExpressionsList.add(ve);
        }
        bigTableKeyColumnMap[i] = ve.getOutputColumn();
        ExprNodeDesc exprNode = keyDesc.get(i);
        bigTableKeyColumnNames[i] = exprNode.toString();
        TypeInfo typeInfo = exprNode.getTypeInfo();
        // same check used in HashTableLoader.
        if (!MapJoinKey.isSupportedField(typeInfo)) {
            supportsKeyTypes = false;
            Category category = typeInfo.getCategory();
            notSupportedKeyTypes.add((category != Category.PRIMITIVE ? category.toString() : ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory().toString()));
        }
        bigTableKeyTypeInfos[i] = typeInfo;
    }
    if (bigTableKeyExpressionsList.size() == 0) {
        bigTableKeyExpressions = null;
    } else {
        bigTableKeyExpressions = bigTableKeyExpressionsList.toArray(new VectorExpression[0]);
    }
    List<ExprNodeDesc> bigTableExprs = desc.getExprs().get(posBigTable);
    VectorExpression[] allBigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs);
    boolean isFastHashTableEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED);
    // Especially since LLAP is prone to turn it off in the MapJoinDesc in later
    // physical optimizer stages...
    boolean isHybridHashJoin = desc.isHybridHashJoin();
    /*
     * Populate vectorMapJoinInfo.
     */
    /*
     * Similarly, we need a mapping since a value expression can be a calculation and the value
     * will go into a scratch column.
     */
    int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length];
    String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length];
    TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length];
    ArrayList<VectorExpression> bigTableValueExpressionsList = new ArrayList<VectorExpression>();
    VectorExpression[] bigTableValueExpressions;
    for (int i = 0; i < bigTableValueColumnMap.length; i++) {
        VectorExpression ve = allBigTableValueExpressions[i];
        if (!IdentityExpression.isColumnOnly(ve)) {
            bigTableValueExpressionsList.add(ve);
        }
        bigTableValueColumnMap[i] = ve.getOutputColumn();
        ExprNodeDesc exprNode = bigTableExprs.get(i);
        bigTableValueColumnNames[i] = exprNode.toString();
        bigTableValueTypeInfos[i] = exprNode.getTypeInfo();
    }
    if (bigTableValueExpressionsList.size() == 0) {
        bigTableValueExpressions = null;
    } else {
        bigTableValueExpressions = bigTableValueExpressionsList.toArray(new VectorExpression[0]);
    }
    vectorMapJoinInfo.setBigTableKeyColumnMap(bigTableKeyColumnMap);
    vectorMapJoinInfo.setBigTableKeyColumnNames(bigTableKeyColumnNames);
    vectorMapJoinInfo.setBigTableKeyTypeInfos(bigTableKeyTypeInfos);
    vectorMapJoinInfo.setBigTableKeyExpressions(bigTableKeyExpressions);
    vectorMapJoinInfo.setBigTableValueColumnMap(bigTableValueColumnMap);
    vectorMapJoinInfo.setBigTableValueColumnNames(bigTableValueColumnNames);
    vectorMapJoinInfo.setBigTableValueTypeInfos(bigTableValueTypeInfos);
    vectorMapJoinInfo.setBigTableValueExpressions(bigTableValueExpressions);
    /*
     * Small table information.
     */
    VectorColumnOutputMapping bigTableRetainedMapping = new VectorColumnOutputMapping("Big Table Retained Mapping");
    VectorColumnOutputMapping bigTableOuterKeyMapping = new VectorColumnOutputMapping("Big Table Outer Key Mapping");
    // The order of the fields in the LazyBinary small table value must be used, so
    // we use the source ordering flavor for the mapping.
    VectorColumnSourceMapping smallTableMapping = new VectorColumnSourceMapping("Small Table Mapping");
    Byte[] order = desc.getTagOrder();
    Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
    boolean isOuterJoin = !desc.getNoOuterJoin();
    /*
     * Gather up big and small table output result information from the MapJoinDesc.
     */
    List<Integer> bigTableRetainList = desc.getRetainList().get(posBigTable);
    int bigTableRetainSize = bigTableRetainList.size();
    int[] smallTableIndices;
    int smallTableIndicesSize;
    List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable);
    if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) {
        smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable);
        smallTableIndicesSize = smallTableIndices.length;
    } else {
        smallTableIndices = null;
        smallTableIndicesSize = 0;
    }
    List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable);
    int smallTableRetainSize = smallTableRetainList.size();
    int smallTableResultSize = 0;
    if (smallTableIndicesSize > 0) {
        smallTableResultSize = smallTableIndicesSize;
    } else if (smallTableRetainSize > 0) {
        smallTableResultSize = smallTableRetainSize;
    }
    /*
     * Determine the big table retained mapping first so we can optimize out (with
     * projection) copying inner join big table keys in the subsequent small table results section.
     */
    // We use a mapping object here so we can build the projection in any order and still
    // end up with output columns ordered 0 to n-1.
    //
    // Also, to avoid copying a big table key into the small table result area for inner joins,
    // we reference it with the projection so there can be duplicate output columns
    // in the projection.
    VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping");
    int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize);
    for (int i = 0; i < bigTableRetainSize; i++) {
        // Since bigTableValueExpressions may do a calculation and produce a scratch column, we
        // need to map to the right batch column.
        int retainColumn = bigTableRetainList.get(i);
        int batchColumnIndex = bigTableValueColumnMap[retainColumn];
        TypeInfo typeInfo = bigTableValueTypeInfos[i];
        // With this map we project the big table batch to make it look like an output batch.
        projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo);
        // Collect columns we copy from the big table batch to the overflow batch.
        if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) {
            // Tolerate repeated use of a big table column.
            bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeInfo);
        }
        nextOutputColumn++;
    }
    /*
     * Now determine the small table results.
     */
    boolean smallTableExprVectorizes = true;
    int firstSmallTableOutputColumn;
    firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0);
    int smallTableOutputCount = 0;
    nextOutputColumn = firstSmallTableOutputColumn;
    // The small table indices array has more information (i.e. keys) than the retain list,
    // so use it if it exists...
    String[] bigTableRetainedNames;
    if (smallTableIndicesSize > 0) {
        smallTableOutputCount = smallTableIndicesSize;
        bigTableRetainedNames = new String[smallTableOutputCount];
        for (int i = 0; i < smallTableIndicesSize; i++) {
            if (smallTableIndices[i] >= 0) {
                // Non-negative numbers indicate a big table key is needed in the
                // small table result "area".
                int keyIndex = smallTableIndices[i];
                // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we
                // need to map to the right batch column.
                int batchKeyColumn = bigTableKeyColumnMap[keyIndex];
                bigTableRetainedNames[i] = bigTableKeyColumnNames[keyIndex];
                TypeInfo typeInfo = bigTableKeyTypeInfos[keyIndex];
                if (!isOuterJoin) {
                    // Optimize inner join keys of small table results.
                    // Project the big table key into the small table result "area".
                    projectionMapping.add(nextOutputColumn, batchKeyColumn, typeInfo);
                    if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) {
                        // If necessary, copy the big table key into the overflow batch's small table
                        // result "area".
                        bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeInfo);
                    }
                } else {
                    // For outer joins, since the small table key can be null when there is no match,
                    // we must have a physical (scratch) column for those keys.  We cannot use the
                    // projection optimization used by inner joins above.
                    int scratchColumn = vContext.allocateScratchColumn(typeInfo);
                    projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
                    bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeInfo);
                    bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeInfo);
                }
            } else {
                // Negative numbers indicate a column to be (deserialized) read from the small
                // table's LazyBinary value row.
                int smallTableValueIndex = -smallTableIndices[i] - 1;
                ExprNodeDesc smallTableExprNode = smallTableExprs.get(i);
                if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) {
                    clearNotVectorizedReason();
                    smallTableExprVectorizes = false;
                }
                bigTableRetainedNames[i] = smallTableExprNode.toString();
                TypeInfo typeInfo = smallTableExprNode.getTypeInfo();
                // Make a new big table scratch column for the small table value.
                int scratchColumn = vContext.allocateScratchColumn(typeInfo);
                projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
                smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo);
            }
            nextOutputColumn++;
        }
    } else if (smallTableRetainSize > 0) {
        smallTableOutputCount = smallTableRetainSize;
        bigTableRetainedNames = new String[smallTableOutputCount];
        for (int i = 0; i < smallTableRetainSize; i++) {
            int smallTableValueIndex = smallTableRetainList.get(i);
            ExprNodeDesc smallTableExprNode = smallTableExprs.get(i);
            if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) {
                clearNotVectorizedReason();
                smallTableExprVectorizes = false;
            }
            bigTableRetainedNames[i] = smallTableExprNode.toString();
            // Make a new big table scratch column for the small table value.
            TypeInfo typeInfo = smallTableExprNode.getTypeInfo();
            int scratchColumn = vContext.allocateScratchColumn(typeInfo);
            projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
            smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo);
            nextOutputColumn++;
        }
    } else {
        bigTableRetainedNames = new String[0];
    }
    boolean useOptimizedTable = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    // Remember the condition variables for EXPLAIN regardless of whether we specialize or not.
    vectorDesc.setUseOptimizedTable(useOptimizedTable);
    vectorDesc.setIsVectorizationMapJoinNativeEnabled(isVectorizationMapJoinNativeEnabled);
    vectorDesc.setEngine(engine);
    vectorDesc.setOneMapJoinCondition(oneMapJoinCondition);
    vectorDesc.setHasNullSafes(hasNullSafes);
    vectorDesc.setSmallTableExprVectorizes(smallTableExprVectorizes);
    vectorDesc.setIsFastHashTableEnabled(isFastHashTableEnabled);
    vectorDesc.setIsHybridHashJoin(isHybridHashJoin);
    vectorDesc.setSupportsKeyTypes(supportsKeyTypes);
    if (!supportsKeyTypes) {
        vectorDesc.setNotSupportedKeyTypes(new ArrayList<String>(notSupportedKeyTypes));
    }
    // Check common conditions for both Optimized and Fast Hash Tables.
    // Assume.
    boolean result = true;
    if (!useOptimizedTable || !isVectorizationMapJoinNativeEnabled || !isTezOrSpark || !oneMapJoinCondition || hasNullSafes || !smallTableExprVectorizes) {
        result = false;
    }
    if (!isFastHashTableEnabled) {
        // Check optimized-only hash table restrictions.
        if (!supportsKeyTypes) {
            result = false;
        }
    } else {
        // The fast hash table implementation does not support Hybrid Grace Hash Join.
        if (isHybridHashJoin) {
            result = false;
        }
    }
    // Convert dynamic arrays and maps to simple arrays.
    bigTableRetainedMapping.finalize();
    bigTableOuterKeyMapping.finalize();
    smallTableMapping.finalize();
    vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping);
    vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping);
    vectorMapJoinInfo.setSmallTableMapping(smallTableMapping);
    projectionMapping.finalize();
    // Verify we added an entry for each output.
    assert projectionMapping.isSourceSequenceGood();
    vectorMapJoinInfo.setProjectionMapping(projectionMapping);
    return result;
}
Also used: VectorMapJoinDesc (org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc), PrimitiveCategory (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory), Category (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category), ArrayList (java.util.ArrayList), VectorColumnOutputMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping), UDFToString (org.apache.hadoop.hive.ql.udf.UDFToString), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), VectorColumnSourceMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping), ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc), HashSet (java.util.HashSet), VectorMapJoinOperator (org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator), StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), UDFToInteger (org.apache.hadoop.hive.ql.udf.UDFToInteger), UDFToByte (org.apache.hadoop.hive.ql.udf.UDFToByte), VectorExpression (org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression)
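
The key loop and the value loop above share one pattern: every vectorized expression's output column is recorded in a column map, while only the non-identity expressions (actual calculations) are kept for later evaluation into scratch columns. A minimal sketch of that pattern as a hypothetical helper; filterIdentityExpressions is not part of Vectorizer, and allExpressions stands in for the arrays returned by vContext.getVectorExpressions(...):

static VectorExpression[] filterIdentityExpressions(VectorExpression[] allExpressions, int[] columnMap) {
    List<VectorExpression> needEvaluation = new ArrayList<VectorExpression>();
    for (int i = 0; i < allExpressions.length; i++) {
        VectorExpression ve = allExpressions[i];
        if (!IdentityExpression.isColumnOnly(ve)) {
            // A calculation: it must be evaluated into its scratch column first.
            needEvaluation.add(ve);
        }
        // Either way, record where the key/value lands in the batch.
        columnMap[i] = ve.getOutputColumn();
    }
    // Returning null signals "nothing to evaluate", as in canSpecializeMapJoin above.
    return needEvaluation.isEmpty() ? null : needEvaluation.toArray(new VectorExpression[0]);
}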

Example 2 with LazyBinary

Use of org.apache.hadoop.hive.serde2.lazy.LazyBinary in project hive by apache.

From the class TestLazyBinaryFast, method testLazyBinaryFastCase.

public void testLazyBinaryFastCase(int caseNum, boolean doNonRandomFill, Random r) throws Throwable {
    SerdeRandomRowSource source = new SerdeRandomRowSource();
    source.init(r);
    int rowCount = 1000;
    Object[][] rows = source.randomRows(rowCount);
    if (doNonRandomFill) {
        MyTestClass.nonRandomRowFill(rows, source.primitiveCategories());
    }
    StructObjectInspector rowStructObjectInspector = source.rowStructObjectInspector();
    PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos();
    int columnCount = primitiveTypeInfos.length;
    int writeColumnCount = columnCount;
    StructObjectInspector writeRowStructObjectInspector = rowStructObjectInspector;
    boolean doWriteFewerColumns = r.nextBoolean();
    if (doWriteFewerColumns) {
        writeColumnCount = 1 + r.nextInt(columnCount);
        if (writeColumnCount == columnCount) {
            doWriteFewerColumns = false;
        } else {
            writeRowStructObjectInspector = source.partialRowStructObjectInspector(writeColumnCount);
        }
    }
    String fieldNames = ObjectInspectorUtils.getFieldNames(rowStructObjectInspector);
    String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowStructObjectInspector);
    AbstractSerDe serde = TestLazyBinarySerDe.getSerDe(fieldNames, fieldTypes);
    AbstractSerDe serde_fewer = null;
    if (doWriteFewerColumns) {
        String partialFieldNames = ObjectInspectorUtils.getFieldNames(writeRowStructObjectInspector);
        String partialFieldTypes = ObjectInspectorUtils.getFieldTypes(writeRowStructObjectInspector);
        serde_fewer = TestLazyBinarySerDe.getSerDe(partialFieldNames, partialFieldTypes);
    }
    testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector,
            primitiveTypeInfos, /* useIncludeColumns */ false, /* doWriteFewerColumns */ false, r);
    testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector,
            primitiveTypeInfos, /* useIncludeColumns */ true, /* doWriteFewerColumns */ false, r);
    /*
     * Can the LazyBinary format really tolerate writing fewer columns?
     */
    // if (doWriteFewerColumns) {
    //     testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector,
    //             primitiveTypeInfos, /* useIncludeColumns */ false, /* doWriteFewerColumns */ true, r);
    //     testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector,
    //             primitiveTypeInfos, /* useIncludeColumns */ true, /* doWriteFewerColumns */ true, r);
    // }
}
Also used: SerdeRandomRowSource (org.apache.hadoop.hive.serde2.SerdeRandomRowSource), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
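
A hypothetical driver for the method above, assuming only the signature shown; the seed and case count are arbitrary:

Random r = new Random(9999L);
for (int caseNum = 0; caseNum < 10; caseNum++) {
    // Alternate the non-random fill so both branches get exercised.
    testLazyBinaryFastCase(caseNum, /* doNonRandomFill */ (caseNum % 2 == 0), r);
}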

Example 3 with LazyBinary

Use of org.apache.hadoop.hive.serde2.lazy.LazyBinary in project hive by apache.

From the class TestLazyBinaryFast, method testLazyBinaryFast.

private void testLazyBinaryFast(SerdeRandomRowSource source, Object[][] rows, AbstractSerDe serde, StructObjectInspector rowOI, AbstractSerDe serde_fewer, StructObjectInspector writeRowOI, PrimitiveTypeInfo[] primitiveTypeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {
    int rowCount = rows.length;
    int columnCount = primitiveTypeInfos.length;
    boolean[] columnsToInclude = null;
    if (useIncludeColumns) {
        columnsToInclude = new boolean[columnCount];
        for (int i = 0; i < columnCount; i++) {
            columnsToInclude[i] = r.nextBoolean();
        }
    }
    int writeColumnCount = columnCount;
    PrimitiveTypeInfo[] writePrimitiveTypeInfos = primitiveTypeInfos;
    if (doWriteFewerColumns) {
        writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
        writePrimitiveTypeInfos = Arrays.copyOf(primitiveTypeInfos, writeColumnCount);
    }
    LazyBinarySerializeWrite lazyBinarySerializeWrite = new LazyBinarySerializeWrite(writeColumnCount);
    // Try to serialize
    BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        Output output = new Output();
        lazyBinarySerializeWrite.set(output);
        for (int index = 0; index < writeColumnCount; index++) {
            Writable writable = (Writable) row[index];
            VerifyFast.serializeWrite(lazyBinarySerializeWrite, primitiveTypeInfos[index], writable);
        }
        BytesWritable bytesWritable = new BytesWritable();
        bytesWritable.set(output.getData(), 0, output.getLength());
        serializeWriteBytes[i] = bytesWritable;
    }
    // Try to deserialize
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // Specifying the right type info length tells LazyBinaryDeserializeRead which is the last
        // column.
        LazyBinaryDeserializeRead lazyBinaryDeserializeRead =
                new LazyBinaryDeserializeRead(writePrimitiveTypeInfos, /* useExternalBuffer */ false);
        BytesWritable bytesWritable = serializeWriteBytes[i];
        lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
        for (int index = 0; index < columnCount; index++) {
            if (useIncludeColumns && !columnsToInclude[index]) {
                lazyBinaryDeserializeRead.skipNextField();
            } else if (index >= writeColumnCount) {
                // Should come back a null.
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
            } else {
                Writable writable = (Writable) row[index];
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
            }
        }
        if (writeColumnCount == columnCount) {
            TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
        }
    }
    // Try to deserialize, using the SerDe class, the Writable row objects created by SerializeWrite.
    for (int i = 0; i < rowCount; i++) {
        BytesWritable bytesWritable = serializeWriteBytes[i];
        LazyBinaryStruct lazyBinaryStruct;
        if (doWriteFewerColumns) {
            lazyBinaryStruct = (LazyBinaryStruct) serde_fewer.deserialize(bytesWritable);
        } else {
            lazyBinaryStruct = (LazyBinaryStruct) serde.deserialize(bytesWritable);
        }
        Object[] row = rows[i];
        for (int index = 0; index < writeColumnCount; index++) {
            PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[index];
            Writable writable = (Writable) row[index];
            Object object = lazyBinaryStruct.getField(index);
            if (writable == null || object == null) {
                if (writable != null || object != null) {
                    fail("SerDe deserialized NULL column mismatch");
                }
            } else {
                if (!object.equals(writable)) {
                    fail("SerDe deserialized value does not match");
                }
            }
        }
    }
    // One Writable per row.
    BytesWritable[] serdeBytes = new BytesWritable[rowCount];
    // Serialize using the SerDe, then below deserialize using DeserializeRead.
    Object[] serdeRow = new Object[writeColumnCount];
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // LazyBinary seems to work better with a row object array instead of a Java object...
        for (int index = 0; index < writeColumnCount; index++) {
            serdeRow[index] = row[index];
        }
        BytesWritable serialized;
        if (doWriteFewerColumns) {
            serialized = (BytesWritable) serde_fewer.serialize(serdeRow, writeRowOI);
        } else {
            serialized = (BytesWritable) serde.serialize(serdeRow, rowOI);
        }
        BytesWritable bytesWritable = new BytesWritable(Arrays.copyOfRange(serialized.getBytes(), 0, serialized.getLength()));
        byte[] bytes1 = bytesWritable.getBytes();
        BytesWritable lazySerializedWriteBytes = serializeWriteBytes[i];
        byte[] bytes2 = Arrays.copyOfRange(lazySerializedWriteBytes.getBytes(), 0, lazySerializedWriteBytes.getLength());
        if (bytes1.length != bytes2.length) {
            fail("SerializeWrite length " + bytes2.length + " and " + "SerDe serialization length " + bytes1.length + " do not match (" + Arrays.toString(primitiveTypeInfos) + ")");
        }
        if (!Arrays.equals(bytes1, bytes2)) {
            fail("SerializeWrite and SerDe serialization does not match (" + Arrays.toString(primitiveTypeInfos) + ")");
        }
        serdeBytes[i] = bytesWritable;
    }
    // Try to deserialize, using DeserializeRead, the Writable row objects created by the SerDe.
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // When doWriteFewerColumns, try to read more fields than exist in the buffer.
        LazyBinaryDeserializeRead lazyBinaryDeserializeRead =
                new LazyBinaryDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
        BytesWritable bytesWritable = serdeBytes[i];
        lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
        for (int index = 0; index < columnCount; index++) {
            if (useIncludeColumns && !columnsToInclude[index]) {
                lazyBinaryDeserializeRead.skipNextField();
            } else if (index >= writeColumnCount) {
                // Should come back a null.
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
            } else {
                Writable writable = (Writable) row[index];
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
            }
        }
        if (writeColumnCount == columnCount) {
            TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
        }
    }
}
Also used: LazyBinarySerializeWrite (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite), Writable (org.apache.hadoop.io.Writable), BytesWritable (org.apache.hadoop.io.BytesWritable), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), Output (org.apache.hadoop.hive.serde2.ByteStream.Output), LazyBinaryDeserializeRead (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead)
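
Stripped of the random-row machinery, the round trip above reduces to a handful of calls. A minimal single-column sketch, reusing only the APIs already shown (VerifyFast, LazyBinarySerializeWrite, LazyBinaryDeserializeRead, Output, BytesWritable, TestCase) plus TypeInfoFactory and LongWritable from the Hive and Hadoop libraries:

PrimitiveTypeInfo[] typeInfos = { TypeInfoFactory.longTypeInfo };
LazyBinarySerializeWrite writer = new LazyBinarySerializeWrite(1);
Output output = new Output();
writer.set(output);
// Serialize one BIGINT field into the output buffer.
VerifyFast.serializeWrite(writer, typeInfos[0], new LongWritable(42L));
BytesWritable bytes = new BytesWritable();
bytes.set(output.getData(), 0, output.getLength());
// Read the field back and verify it matches what was written.
LazyBinaryDeserializeRead reader = new LazyBinaryDeserializeRead(typeInfos, /* useExternalBuffer */ false);
reader.set(bytes.getBytes(), 0, bytes.getLength());
VerifyFast.verifyDeserializeRead(reader, typeInfos[0], new LongWritable(42L));
TestCase.assertTrue(reader.isEndOfInputReached());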

Example 4 with LazyBinary

Use of org.apache.hadoop.hive.serde2.lazy.LazyBinary in project hive by apache.

From the class GroupByOperator, method shouldBeFlushed.

/**
 * Based on user parameters, decide whether the hash table should be flushed.
 *
 * @param newKeys
 *          keys for the row under consideration
 */
private boolean shouldBeFlushed(KeyWrapper newKeys) {
    int numEntries = hashAggregations.size();
    long usedMemory;
    float rate;
    // Re-estimate the variable portion of the row size every NUMROWSESTIMATESIZE rows.
    if ((numEntriesHashTable == 0) || ((numEntries % NUMROWSESTIMATESIZE) == 0)) {
        // Check how much memory is currently used.
        usedMemory = memoryMXBean.getHeapMemoryUsage().getUsed();
        // TODO: there is no easy and reliable way to compute the memory used by the executor threads and on-heap cache.
        // Assuming the used memory is equally divided among all executors.
        usedMemory = isLlap ? usedMemory / numExecutors : usedMemory;
        rate = (float) usedMemory / (float) maxMemory;
        if (rate > memoryThreshold) {
            return (!isTez || numEntriesHashTable != 0);
        }
        for (Integer pos : keyPositionsSize) {
            Object key = newKeys.getKeyArray()[pos.intValue()];
            // Ignore nulls
            if (key != null) {
                if (key instanceof LazyString) {
                    totalVariableSize += ((LazyPrimitive<LazyStringObjectInspector, Text>) key).getWritableObject().getLength();
                } else if (key instanceof String) {
                    totalVariableSize += ((String) key).length();
                } else if (key instanceof Text) {
                    totalVariableSize += ((Text) key).getLength();
                } else if (key instanceof LazyBinary) {
                    totalVariableSize += ((LazyPrimitive<LazyBinaryObjectInspector, BytesWritable>) key).getWritableObject().getLength();
                } else if (key instanceof BytesWritable) {
                    totalVariableSize += ((BytesWritable) key).getLength();
                } else if (key instanceof ByteArrayRef) {
                    totalVariableSize += ((ByteArrayRef) key).getData().length;
                }
            }
        }
        AggregationBuffer[] aggs = hashAggregations.get(newKeys);
        for (int i = 0; i < aggs.length; i++) {
            AggregationBuffer agg = aggs[i];
            if (estimableAggregationEvaluators[i]) {
                totalVariableSize += ((GenericUDAFEvaluator.AbstractAggregationBuffer) agg).estimate();
                continue;
            }
            if (aggrPositions[i] != null) {
                totalVariableSize += estimateSize(agg, aggrPositions[i]);
            }
        }
        numEntriesVarSize++;
        // Update the number of entries that can fit in the hash table
        numEntriesHashTable = (int) (maxHashTblMemory / (fixedRowSize + (totalVariableSize / numEntriesVarSize)));
        LOG.trace("Hash Aggr: #hash table = {} #max in hash table = {}", numEntries, numEntriesHashTable);
    }
    // flush if necessary
    return (numEntries >= numEntriesHashTable);
}
Also used: GenericUDAFEvaluator (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator), LazyBinaryObjectInspector (org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBinaryObjectInspector), Text (org.apache.hadoop.io.Text), BytesWritable (org.apache.hadoop.io.BytesWritable), LazyString (org.apache.hadoop.hive.serde2.lazy.LazyString), LazyPrimitive (org.apache.hadoop.hive.serde2.lazy.LazyPrimitive), LazyBinary (org.apache.hadoop.hive.serde2.lazy.LazyBinary), ByteArrayRef (org.apache.hadoop.hive.serde2.lazy.ByteArrayRef), UnionObject (org.apache.hadoop.hive.serde2.objectinspector.UnionObject), AggregationBuffer (org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer)
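
For LazyBinary keys, the interesting branch is the cast through LazyPrimitive: the variable size of a LazyBinary key is just the byte length of its underlying BytesWritable. A hypothetical helper isolating the per-key accounting (the unchecked cast mirrors the operator code above):

@SuppressWarnings("unchecked")
static long variableKeySize(Object key) {
    if (key == null) {
        return 0;  // Nulls are ignored, as in shouldBeFlushed above.
    }
    if (key instanceof LazyString) {
        return ((LazyPrimitive<LazyStringObjectInspector, Text>) key).getWritableObject().getLength();
    } else if (key instanceof LazyBinary) {
        return ((LazyPrimitive<LazyBinaryObjectInspector, BytesWritable>) key).getWritableObject().getLength();
    } else if (key instanceof String) {
        return ((String) key).length();
    } else if (key instanceof Text) {
        return ((Text) key).getLength();
    } else if (key instanceof BytesWritable) {
        return ((BytesWritable) key).getLength();
    } else if (key instanceof ByteArrayRef) {
        return ((ByteArrayRef) key).getData().length;
    }
    return 0;
}

The capacity estimate at the end then follows directly: with, say, maxHashTblMemory of 100 MB, a fixedRowSize of 100 bytes, and an average variable size of 150 bytes per entry, numEntriesHashTable comes out to roughly 400,000 entries.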

Example 5 with LazyBinary

Use of org.apache.hadoop.hive.serde2.lazy.LazyBinary in project hive by apache.

From the class MapJoinTestConfig, method createVectorMapJoinDesc.

public static VectorMapJoinDesc createVectorMapJoinDesc(MapJoinTestDescription testDesc) {
    VectorMapJoinDesc vectorDesc = new VectorMapJoinDesc();
    vectorDesc.setHashTableImplementationType(HashTableImplementationType.FAST);
    HashTableKind hashTableKind;
    switch(testDesc.vectorMapJoinVariation) {
        case INNER:
            hashTableKind = HashTableKind.HASH_MAP;
            break;
        case INNER_BIG_ONLY:
            hashTableKind = HashTableKind.HASH_MULTISET;
            break;
        case LEFT_SEMI:
        case LEFT_ANTI:
            hashTableKind = HashTableKind.HASH_SET;
            break;
        case OUTER:
        case FULL_OUTER:
            hashTableKind = HashTableKind.HASH_MAP;
            break;
        default:
            throw new RuntimeException("unknown operator variation " + testDesc.vectorMapJoinVariation);
    }
    vectorDesc.setHashTableKind(hashTableKind);
    // Assume.
    HashTableKeyType hashTableKeyType = HashTableKeyType.MULTI_KEY;
    if (testDesc.bigTableKeyTypeInfos.length == 1) {
        switch(((PrimitiveTypeInfo) testDesc.bigTableKeyTypeInfos[0]).getPrimitiveCategory()) {
            case BOOLEAN:
                hashTableKeyType = HashTableKeyType.BOOLEAN;
                break;
            case BYTE:
                hashTableKeyType = HashTableKeyType.BYTE;
                break;
            case SHORT:
                hashTableKeyType = HashTableKeyType.SHORT;
                break;
            case INT:
                hashTableKeyType = HashTableKeyType.INT;
                break;
            case LONG:
                hashTableKeyType = HashTableKeyType.LONG;
                break;
            case STRING:
                hashTableKeyType = HashTableKeyType.STRING;
                break;
            default:
                // Keep MULTI_KEY for all other key types.
        }
    }
    vectorDesc.setHashTableKeyType(hashTableKeyType);
    vectorDesc.setVectorMapJoinVariation(testDesc.vectorMapJoinVariation);
    vectorDesc.setMinMaxEnabled(false);
    VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo();
    vectorMapJoinInfo.setBigTableKeyColumnMap(testDesc.bigTableKeyColumnNums);
    vectorMapJoinInfo.setBigTableKeyColumnNames(testDesc.bigTableKeyColumnNames);
    vectorMapJoinInfo.setBigTableKeyTypeInfos(testDesc.bigTableKeyTypeInfos);
    vectorMapJoinInfo.setSlimmedBigTableKeyExpressions(null);
    vectorDesc.setAllBigTableKeyExpressions(null);
    vectorMapJoinInfo.setBigTableValueColumnMap(testDesc.bigTableColumnNums);
    vectorMapJoinInfo.setBigTableValueColumnNames(testDesc.bigTableColumnNames);
    vectorMapJoinInfo.setBigTableValueTypeInfos(testDesc.bigTableTypeInfos);
    vectorMapJoinInfo.setSlimmedBigTableValueExpressions(null);
    vectorDesc.setAllBigTableValueExpressions(null);
    vectorMapJoinInfo.setBigTableFilterExpressions(new VectorExpression[0]);
    /*
     * Column mapping.
     */
    VectorColumnOutputMapping bigTableRetainMapping = new VectorColumnOutputMapping("Big Table Retain Mapping");
    VectorColumnOutputMapping nonOuterSmallTableKeyMapping = new VectorColumnOutputMapping("Non Outer Small Table Key Mapping");
    VectorColumnOutputMapping outerSmallTableKeyMapping = new VectorColumnOutputMapping("Outer Small Table Key Mapping");
    VectorColumnSourceMapping fullOuterSmallTableKeyMapping = new VectorColumnSourceMapping("Full Outer Small Table Key Mapping");
    VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping");
    int nextOutputColumn = 0;
    final int bigTableRetainedSize = testDesc.bigTableRetainColumnNums.length;
    for (int i = 0; i < bigTableRetainedSize; i++) {
        final int batchColumnIndex = testDesc.bigTableRetainColumnNums[i];
        TypeInfo typeInfo = testDesc.bigTableTypeInfos[i];
        projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo);
        // Collect columns we copy from the big table batch to the overflow batch.
        if (!bigTableRetainMapping.containsOutputColumn(batchColumnIndex)) {
            // Tolerate repeated use of a big table column.
            bigTableRetainMapping.add(batchColumnIndex, batchColumnIndex, typeInfo);
        }
        nextOutputColumn++;
    }
    boolean isOuterJoin = (testDesc.vectorMapJoinVariation == VectorMapJoinVariation.OUTER || testDesc.vectorMapJoinVariation == VectorMapJoinVariation.FULL_OUTER);
    int emulateScratchColumn = testDesc.bigTableTypeInfos.length;
    VectorColumnOutputMapping smallTableKeyOutputMapping = new VectorColumnOutputMapping("Small Table Key Output Mapping");
    final int smallTableKeyRetainSize = testDesc.smallTableRetainKeyColumnNums.length;
    for (int i = 0; i < testDesc.smallTableRetainKeyColumnNums.length; i++) {
        final int smallTableKeyColumnNum = testDesc.smallTableRetainKeyColumnNums[i];
        final int bigTableKeyColumnNum = testDesc.bigTableKeyColumnNums[smallTableKeyColumnNum];
        TypeInfo keyTypeInfo = testDesc.smallTableKeyTypeInfos[smallTableKeyColumnNum];
        if (!isOuterJoin) {
            // Project the big table key into the small table result "area".
            projectionMapping.add(nextOutputColumn, bigTableKeyColumnNum, keyTypeInfo);
            if (!bigTableRetainMapping.containsOutputColumn(bigTableKeyColumnNum)) {
                nonOuterSmallTableKeyMapping.add(bigTableKeyColumnNum, bigTableKeyColumnNum, keyTypeInfo);
            }
        } else {
            outerSmallTableKeyMapping.add(bigTableKeyColumnNum, emulateScratchColumn, keyTypeInfo);
            projectionMapping.add(nextOutputColumn, emulateScratchColumn, keyTypeInfo);
            // For FULL OUTER MapJoin, we need to be able to deserialize a Small Table key
            // into the output result.
            fullOuterSmallTableKeyMapping.add(smallTableKeyColumnNum, emulateScratchColumn, keyTypeInfo);
            emulateScratchColumn++;
        }
        nextOutputColumn++;
    }
    // The order of the fields in the LazyBinary small table value must be used, so
    // we use the source ordering flavor for the mapping.
    VectorColumnSourceMapping smallTableValueMapping = new VectorColumnSourceMapping("Small Table Value Mapping");
    for (int i = 0; i < testDesc.smallTableValueTypeInfos.length; i++) {
        smallTableValueMapping.add(i, emulateScratchColumn, testDesc.smallTableValueTypeInfos[i]);
        projectionMapping.add(nextOutputColumn, emulateScratchColumn, testDesc.smallTableValueTypeInfos[i]);
        emulateScratchColumn++;
        nextOutputColumn++;
    }
    // Convert dynamic arrays and maps to simple arrays.
    bigTableRetainMapping.finalize();
    vectorMapJoinInfo.setBigTableRetainColumnMap(bigTableRetainMapping.getOutputColumns());
    vectorMapJoinInfo.setBigTableRetainTypeInfos(bigTableRetainMapping.getTypeInfos());
    nonOuterSmallTableKeyMapping.finalize();
    vectorMapJoinInfo.setNonOuterSmallTableKeyColumnMap(nonOuterSmallTableKeyMapping.getOutputColumns());
    vectorMapJoinInfo.setNonOuterSmallTableKeyTypeInfos(nonOuterSmallTableKeyMapping.getTypeInfos());
    outerSmallTableKeyMapping.finalize();
    fullOuterSmallTableKeyMapping.finalize();
    vectorMapJoinInfo.setOuterSmallTableKeyMapping(outerSmallTableKeyMapping);
    vectorMapJoinInfo.setFullOuterSmallTableKeyMapping(fullOuterSmallTableKeyMapping);
    smallTableValueMapping.finalize();
    vectorMapJoinInfo.setSmallTableValueMapping(smallTableValueMapping);
    projectionMapping.finalize();
    // Verify we added an entry for each output.
    assert projectionMapping.isSourceSequenceGood();
    vectorMapJoinInfo.setProjectionMapping(projectionMapping);
    if (projectionMapping.getCount() != testDesc.outputColumnNames.length) {
        throw new RuntimeException("Projection mapping count " + projectionMapping.getCount()
                + " does not match output column count " + testDesc.outputColumnNames.length);
    }
    vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo);
    return vectorDesc;
}
Also used: HashTableKeyType (org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType), VectorMapJoinDesc (org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc), HashTableKind (org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind), VectorMapJoinInfo (org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo), VectorColumnOutputMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping), PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo), TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo), VectorColumnSourceMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping)
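
The mapping objects do the bookkeeping in both this test configuration and the Vectorizer code of Example 1. A minimal sketch using only the calls already shown (add, finalize, isSourceSequenceGood); the column numbers and type infos are made up for illustration:

VectorColumnSourceMapping projection = new VectorColumnSourceMapping("Example Projection Mapping");
// In the usage above, the first argument carries the output column number and the
// second the batch (or scratch) column it reads from.
projection.add(0, 5, TypeInfoFactory.longTypeInfo);    // output 0 comes from batch column 5
projection.add(1, 2, TypeInfoFactory.stringTypeInfo);  // output 1 comes from batch column 2
// Convert the dynamic arrays and maps to simple arrays.
projection.finalize();
// Holds only if every output position 0..n-1 received exactly one entry.
assert projection.isSourceSequenceGood();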

Aggregations

PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo): 6 usages
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 6 usages
ArrayList (java.util.ArrayList): 3 usages
VectorColumnOutputMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping): 3 usages
VectorColumnSourceMapping (org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping): 3 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 3 usages
ByteArrayRef (org.apache.hadoop.hive.serde2.lazy.ByteArrayRef): 3 usages
StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo): 3 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 3 usages
HashSet (java.util.HashSet): 2 usages
List (java.util.List): 2 usages
VectorMapJoinOperator (org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator): 2 usages
VectorExpression (org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression): 2 usages
VectorMapJoinDesc (org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc): 2 usages
VectorMapJoinInfo (org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo): 2 usages
UDFToByte (org.apache.hadoop.hive.ql.udf.UDFToByte): 2 usages
UDFToInteger (org.apache.hadoop.hive.ql.udf.UDFToInteger): 2 usages
AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe): 2 usages
Output (org.apache.hadoop.hive.serde2.ByteStream.Output): 2 usages
SerdeRandomRowSource (org.apache.hadoop.hive.serde2.SerdeRandomRowSource): 2 usages