
Example 56 with Deserializer

Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

From the class Vectorizer, the method canSpecializeReduceSink, which decides whether a ReduceSinkOperator can be replaced by a native vectorized ReduceSink implementation:

private boolean canSpecializeReduceSink(ReduceSinkDesc desc, boolean isTezOrSpark, VectorizationContext vContext, VectorReduceSinkDesc vectorDesc) throws HiveException {
    VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();
    // Various restrictions.
    // Set this if we encounter a condition we were not expecting.
    boolean isUnexpectedCondition = false;
    boolean isVectorizationReduceSinkNativeEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED);
    String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
    int limit = desc.getTopN();
    float memUsage = desc.getTopNMemoryUsage();
    boolean hasPTFTopN = (limit >= 0 && memUsage > 0 && desc.isPTFReduceSink());
    boolean hasDistinctColumns = (desc.getDistinctColumnIndices().size() > 0);
    TableDesc keyTableDesc = desc.getKeySerializeInfo();
    Class<? extends Deserializer> keySerializerClass = keyTableDesc.getSerDeClass();
    boolean isKeyBinarySortable = (keySerializerClass == org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class);
    TableDesc valueTableDesc = desc.getValueSerializeInfo();
    Class<? extends Deserializer> valueDeserializerClass = valueTableDesc.getSerDeClass();
    boolean isValueLazyBinary = (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class) || (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe2.class);
    // We are doing work here we'd normally do in VectorGroupByCommonOperator's constructor.
    // So if we later decide not to specialize, we'll just waste any scratch columns allocated...
    List<ExprNodeDesc> keysDescs = desc.getKeyCols();
    final boolean isEmptyKey = (keysDescs.size() == 0);
    if (!isEmptyKey) {
        VectorExpression[] allKeyExpressions = vContext.getVectorExpressions(keysDescs);
        final int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length];
        final TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length];
        final Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length];
        final VectorExpression[] reduceSinkKeyExpressions;
        // Since a key expression can be a calculation and the key will go into a scratch column,
        // we need the mapping and type information.
        ArrayList<VectorExpression> groupByKeyExpressionsList = new ArrayList<VectorExpression>();
        for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) {
            VectorExpression ve = allKeyExpressions[i];
            reduceSinkKeyColumnMap[i] = ve.getOutputColumnNum();
            reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo();
            reduceSinkKeyColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]);
            if (!IdentityExpression.isColumnOnly(ve)) {
                groupByKeyExpressionsList.add(ve);
            }
        }
        if (groupByKeyExpressionsList.size() == 0) {
            reduceSinkKeyExpressions = null;
        } else {
            reduceSinkKeyExpressions = groupByKeyExpressionsList.toArray(new VectorExpression[0]);
        }
        vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap);
        vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos);
        vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes);
        vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions);
    }
    List<ExprNodeDesc> valueDescs = desc.getValueCols();
    final boolean isEmptyValue = (valueDescs.size() == 0);
    if (!isEmptyValue) {
        VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs);
        final int[] reduceSinkValueColumnMap = new int[allValueExpressions.length];
        final TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length];
        final Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length];
        VectorExpression[] reduceSinkValueExpressions;
        ArrayList<VectorExpression> reduceSinkValueExpressionsList = new ArrayList<VectorExpression>();
        for (int i = 0; i < valueDescs.size(); ++i) {
            VectorExpression ve = allValueExpressions[i];
            reduceSinkValueColumnMap[i] = ve.getOutputColumnNum();
            reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo();
            reduceSinkValueColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]);
            if (!IdentityExpression.isColumnOnly(ve)) {
                reduceSinkValueExpressionsList.add(ve);
            }
        }
        if (reduceSinkValueExpressionsList.size() == 0) {
            reduceSinkValueExpressions = null;
        } else {
            reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]);
        }
        vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap);
        vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos);
        vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes);
        vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions);
    }
    boolean useUniformHash = desc.getReducerTraits().contains(UNIFORM);
    vectorReduceSinkInfo.setUseUniformHash(useUniformHash);
    List<ExprNodeDesc> bucketDescs = desc.getBucketCols();
    final boolean isEmptyBuckets = (bucketDescs == null || bucketDescs.size() == 0);
    List<ExprNodeDesc> partitionDescs = desc.getPartitionCols();
    final boolean isEmptyPartitions = (partitionDescs == null || partitionDescs.size() == 0);
    if (useUniformHash || (isEmptyKey && isEmptyBuckets && isEmptyPartitions)) {
        // NOTE: For Uniform Hash or no buckets/partitions, when the key is empty, we will use the VectorReduceSinkEmptyKeyOperator instead.
    } else {
        // Collect bucket and/or partition information for object hashing.
        int[] reduceSinkBucketColumnMap = null;
        TypeInfo[] reduceSinkBucketTypeInfos = null;
        Type[] reduceSinkBucketColumnVectorTypes = null;
        VectorExpression[] reduceSinkBucketExpressions = null;
        if (!isEmptyBuckets) {
            VectorExpression[] allBucketExpressions = vContext.getVectorExpressions(bucketDescs);
            reduceSinkBucketColumnMap = new int[bucketDescs.size()];
            reduceSinkBucketTypeInfos = new TypeInfo[bucketDescs.size()];
            reduceSinkBucketColumnVectorTypes = new Type[bucketDescs.size()];
            ArrayList<VectorExpression> reduceSinkBucketExpressionsList = new ArrayList<VectorExpression>();
            for (int i = 0; i < bucketDescs.size(); ++i) {
                VectorExpression ve = allBucketExpressions[i];
                reduceSinkBucketColumnMap[i] = ve.getOutputColumnNum();
                reduceSinkBucketTypeInfos[i] = bucketDescs.get(i).getTypeInfo();
                reduceSinkBucketColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkBucketTypeInfos[i]);
                if (!IdentityExpression.isColumnOnly(ve)) {
                    reduceSinkBucketExpressionsList.add(ve);
                }
            }
            if (reduceSinkBucketExpressionsList.size() == 0) {
                reduceSinkBucketExpressions = null;
            } else {
                reduceSinkBucketExpressions = reduceSinkBucketExpressionsList.toArray(new VectorExpression[0]);
            }
        }
        int[] reduceSinkPartitionColumnMap = null;
        TypeInfo[] reduceSinkPartitionTypeInfos = null;
        Type[] reduceSinkPartitionColumnVectorTypes = null;
        VectorExpression[] reduceSinkPartitionExpressions = null;
        if (!isEmptyPartitions) {
            VectorExpression[] allPartitionExpressions = vContext.getVectorExpressions(partitionDescs);
            reduceSinkPartitionColumnMap = new int[partitionDescs.size()];
            reduceSinkPartitionTypeInfos = new TypeInfo[partitionDescs.size()];
            reduceSinkPartitionColumnVectorTypes = new Type[partitionDescs.size()];
            ArrayList<VectorExpression> reduceSinkPartitionExpressionsList = new ArrayList<VectorExpression>();
            for (int i = 0; i < partitionDescs.size(); ++i) {
                VectorExpression ve = allPartitionExpressions[i];
                reduceSinkPartitionColumnMap[i] = ve.getOutputColumnNum();
                reduceSinkPartitionTypeInfos[i] = partitionDescs.get(i).getTypeInfo();
                reduceSinkPartitionColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkPartitionTypeInfos[i]);
                if (!IdentityExpression.isColumnOnly(ve)) {
                    reduceSinkPartitionExpressionsList.add(ve);
                }
            }
            if (reduceSinkPartitionExpressionsList.size() == 0) {
                reduceSinkPartitionExpressions = null;
            } else {
                reduceSinkPartitionExpressions = reduceSinkPartitionExpressionsList.toArray(new VectorExpression[0]);
            }
        }
        vectorReduceSinkInfo.setReduceSinkBucketColumnMap(reduceSinkBucketColumnMap);
        vectorReduceSinkInfo.setReduceSinkBucketTypeInfos(reduceSinkBucketTypeInfos);
        vectorReduceSinkInfo.setReduceSinkBucketColumnVectorTypes(reduceSinkBucketColumnVectorTypes);
        vectorReduceSinkInfo.setReduceSinkBucketExpressions(reduceSinkBucketExpressions);
        vectorReduceSinkInfo.setReduceSinkPartitionColumnMap(reduceSinkPartitionColumnMap);
        vectorReduceSinkInfo.setReduceSinkPartitionTypeInfos(reduceSinkPartitionTypeInfos);
        vectorReduceSinkInfo.setReduceSinkPartitionColumnVectorTypes(reduceSinkPartitionColumnVectorTypes);
        vectorReduceSinkInfo.setReduceSinkPartitionExpressions(reduceSinkPartitionExpressions);
    }
    // Remember the condition variables for EXPLAIN regardless.
    vectorDesc.setVectorReduceSinkInfo(vectorReduceSinkInfo);
    vectorDesc.setIsVectorizationReduceSinkNativeEnabled(isVectorizationReduceSinkNativeEnabled);
    vectorDesc.setEngine(engine);
    vectorDesc.setIsEmptyKey(isEmptyKey);
    vectorDesc.setIsEmptyValue(isEmptyValue);
    vectorDesc.setIsEmptyBuckets(isEmptyBuckets);
    vectorDesc.setIsEmptyPartitions(isEmptyPartitions);
    vectorDesc.setHasPTFTopN(hasPTFTopN);
    vectorDesc.setHasDistinctColumns(hasDistinctColumns);
    vectorDesc.setIsKeyBinarySortable(isKeyBinarySortable);
    vectorDesc.setIsValueLazyBinary(isValueLazyBinary);
    vectorDesc.setIsAcidChange(desc.getWriteType() == AcidUtils.Operation.DELETE || desc.getWriteType() == AcidUtils.Operation.UPDATE);
    // This indicates we logged an inconsistency (from our point-of-view) and will not make this
    // operator native...
    vectorDesc.setIsUnexpectedCondition(isUnexpectedCondition);
    // Many restrictions.
    if (!isVectorizationReduceSinkNativeEnabled || !isTezOrSpark || hasPTFTopN || hasDistinctColumns || !isKeyBinarySortable || !isValueLazyBinary || isUnexpectedCondition) {
        return false;
    }
    return true;
}
Also used : ArrayList(java.util.ArrayList) VectorReduceSinkInfo(org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) InConstantType(org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.InConstantType) HashTableImplementationType(org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType) HashTableKeyType(org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType) Type(org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type) VectorDeserializeType(org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType) SupportedFunctionType(org.apache.hadoop.hive.ql.plan.VectorPTFDesc.SupportedFunctionType) OperatorType(org.apache.hadoop.hive.ql.plan.api.OperatorType) WindowType(org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowType) VectorExpression(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
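In short, the method records the condition flags on vectorDesc regardless of the outcome (so EXPLAIN can report them) and returns the go/no-go decision for specialization. A hypothetical call-site sketch follows; only canSpecializeReduceSink itself comes from the snippet above, the surrounding names are illustrative assumptions:

// Hypothetical caller, not the actual Vectorizer code.
VectorReduceSinkDesc vectorDesc = new VectorReduceSinkDesc();
if (canSpecializeReduceSink(reduceSinkDesc, isTezOrSpark, vContext, vectorDesc)) {
    // Safe to substitute a native vectorized ReduceSink operator.
} else {
    // Keep the row-mode ReduceSinkOperator; vectorDesc still carries the recorded
    // flags, so EXPLAIN can show why specialization was rejected.
}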

Example 57 with Deserializer

Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

From the class TestSerdeWithFieldComments, the method testFieldComments, which checks that getFieldsFromDeserializer preserves field comments and substitutes "from deserializer" when a field has none:

@Test
public void testFieldComments() throws MetaException, SerDeException {
    StructObjectInspector mockSOI = mock(StructObjectInspector.class);
    when(mockSOI.getCategory()).thenReturn(ObjectInspector.Category.STRUCT);
    List fieldRefs = new ArrayList<StructField>();
    // Add field with a comment...
    fieldRefs.add(mockedStructField("first", "type name 1", "this is a comment"));
    // ... and one without
    fieldRefs.add(mockedStructField("second", "type name 2", null));
    when(mockSOI.getAllStructFieldRefs()).thenReturn(fieldRefs);
    Deserializer mockDe = mock(Deserializer.class);
    when(mockDe.getObjectInspector()).thenReturn(mockSOI);
    List<FieldSchema> result = HiveMetaStoreUtils.getFieldsFromDeserializer("testTable", mockDe);
    assertEquals(2, result.size());
    assertEquals("first", result.get(0).getName());
    assertEquals("this is a comment", result.get(0).getComment());
    assertEquals("second", result.get(1).getName());
    assertEquals("from deserializer", result.get(1).getComment());
}
Also used : FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
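The test above relies on a mockedStructField helper that is defined elsewhere in the test class and is not shown in this snippet. A minimal sketch of what such a helper could look like, using only Mockito and the StructField and ObjectInspector interfaces the test already works with (an assumption for illustration, not the actual helper):

// Hypothetical helper matching the calls in testFieldComments above.
// Assumes static imports of org.mockito.Mockito.mock and org.mockito.Mockito.when,
// plus StructField and ObjectInspector from org.apache.hadoop.hive.serde2.objectinspector.
private StructField mockedStructField(String name, String typeName, String comment) {
    ObjectInspector typeOI = mock(ObjectInspector.class);
    when(typeOI.getTypeName()).thenReturn(typeName);
    StructField field = mock(StructField.class);
    when(field.getFieldName()).thenReturn(name);
    when(field.getFieldObjectInspector()).thenReturn(typeOI);
    when(field.getFieldComment()).thenReturn(comment);
    return field;
}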

Example 58 with Deserializer

Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

From the class TestDeserializer, the method testListDeserialize, which deserializes a list of longs into an Iceberg GenericRecord:

@Test
public void testListDeserialize() {
    Schema schema = new Schema(optional(1, "list_type", Types.ListType.ofOptional(2, Types.LongType.get())));
    StructObjectInspector inspector = ObjectInspectorFactory.getStandardStructObjectInspector(Arrays.asList("list_type"), Arrays.asList(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector)));
    Deserializer deserializer = new Deserializer.Builder().schema(schema).writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema)).sourceInspector(inspector).build();
    Record expected = GenericRecord.create(schema);
    expected.set(0, Collections.singletonList(1L));
    Object[] data = new Object[] { new Object[] { new LongWritable(1L) } };
    Record actual = deserializer.deserialize(data);
    Assert.assertEquals(expected, actual);
}
Also used : Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) GenericRecord(org.apache.iceberg.data.GenericRecord) LongWritable(org.apache.hadoop.io.LongWritable) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)

Example 59 with Deserializer

Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

From the class TestDeserializer, the method testSchemaDeserialize, which deserializes a row whose source columns are named "0:col1" and "1:col2" into a record for the customer schema:

@Test
public void testSchemaDeserialize() {
    StandardStructObjectInspector schemaObjectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(Arrays.asList("0:col1", "1:col2"), Arrays.asList(PrimitiveObjectInspectorFactory.writableLongObjectInspector, PrimitiveObjectInspectorFactory.writableStringObjectInspector));
    Deserializer deserializer = new Deserializer.Builder().schema(CUSTOMER_SCHEMA).writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA)).sourceInspector(schemaObjectInspector).build();
    Record expected = GenericRecord.create(CUSTOMER_SCHEMA);
    expected.set(0, 1L);
    expected.set(1, "Bob");
    Record actual = deserializer.deserialize(new Object[] { new LongWritable(1L), new Text("Bob") });
    Assert.assertEquals(expected, actual);
}
Also used : StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) Record(org.apache.iceberg.data.Record) GenericRecord(org.apache.iceberg.data.GenericRecord) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) Test(org.junit.Test)
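Both TestDeserializer examples reference a CUSTOMER_SCHEMA constant defined elsewhere in the test class. Judging from the values the test sets on the record (a long id and the string "Bob"), it has one long column and one string column; the sketch below is an assumed shape with made-up field names and ids, not the actual definition:

// Hypothetical shape of CUSTOMER_SCHEMA; only the column types are implied by the test above.
private static final Schema CUSTOMER_SCHEMA = new Schema(
    Types.NestedField.optional(1, "customer_id", Types.LongType.get()),
    Types.NestedField.optional(2, "first_name", Types.StringType.get()));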

Example 60 with Deserializer

Use of org.apache.hadoop.hive.serde2.Deserializer in project hive by apache.

From the class TestAvroDeserializer, the method canDeserializeTimestamps:

/**
 * Test whether Avro timestamps can be deserialized according to new behavior (storage in UTC but
 * LocalDateTime semantics as timestamps are converted back to the writer time zone) as well as
 * old behavior (Instant semantics).
 */
@Test
public void canDeserializeTimestamps() throws SerDeException, IOException {
    List<String> columnNames = new ArrayList<>();
    columnNames.add("timestampField");
    List<TypeInfo> columnTypes = new ArrayList<>();
    columnTypes.add(TypeInfoFactory.getPrimitiveTypeInfo("timestamp"));
    Schema readerSchema = AvroSerdeUtils.getSchemaFor(TestAvroObjectInspectorGenerator.TIMESTAMP_SCHEMA);
    // 2019-01-02 00:00:00 GMT is 1546387200000 milliseconds after epoch
    GenericData.Record record = new GenericData.Record(readerSchema);
    record.put("timestampField", 1546387200999L);
    assertTrue(GENERIC_DATA.validate(readerSchema, record));
    AvroGenericRecordWritable agrw = new AvroGenericRecordWritable(ZoneId.of("America/New_York"), false, false);
    agrw.setRecord(record);
    agrw.setFileSchema(readerSchema);
    agrw.setRecordReaderID(new UID());
    AvroDeserializer deserializer = new AvroDeserializer();
    ArrayList<Object> row = (ArrayList<Object>) deserializer.deserialize(columnNames, columnTypes, agrw, readerSchema);
    Timestamp resultTimestamp = (Timestamp) row.get(0);
    // 2019-01-02 00:00:00 GMT is 2019-01-01 19:00:00 GMT-0500 (America/New_York / EST)
    assertEquals(Timestamp.valueOf("2019-01-01 19:00:00.999"), resultTimestamp);
    // Do the same without specifying writer time zone. This tests deserialization of older records
    // which should be interpreted in Instant semantics
    AvroGenericRecordWritable agrw2 = new AvroGenericRecordWritable();
    agrw2.setRecord(record);
    agrw2.setFileSchema(readerSchema);
    agrw2.setRecordReaderID(new UID());
    row = (ArrayList<Object>) deserializer.deserialize(columnNames, columnTypes, agrw2, readerSchema);
    resultTimestamp = (Timestamp) row.get(0);
    // 2019-01-02 00:00:00 GMT is 2019-01-01 16:00:00 in zone GMT-0800 (PST)
    // This is the time zone for VM in test.
    assertEquals(Timestamp.valueOf("2019-01-01 16:00:00.999"), resultTimestamp);
}
Also used : Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) GenericData(org.apache.avro.generic.GenericData) Timestamp(org.apache.hadoop.hive.common.type.Timestamp) UID(java.rmi.server.UID) Test(org.junit.Test)

Aggregations

Deserializer (org.apache.hadoop.hive.serde2.Deserializer): 27
ArrayList (java.util.ArrayList): 25
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 20
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 19
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 18
IOException (java.io.IOException): 16
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 15
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 14
Properties (java.util.Properties): 12
Path (org.apache.hadoop.fs.Path): 11
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 10
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 10
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint): 8
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint): 8
SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint): 8
SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint): 8
DefaultConstraint (org.apache.hadoop.hive.ql.metadata.DefaultConstraint): 8
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 8
HashMap (java.util.HashMap): 7
List (java.util.List): 7
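For orientation, the examples above all revolve around the same small contract: a Deserializer is initialized with table properties, turns each Writable row into an in-memory object, and exposes an ObjectInspector that describes how to read that object. Below is a minimal sketch using LazySimpleSerDe, assuming a Hive release where the two-argument initialize(Configuration, Properties) is still exposed (newer releases use a three-argument variant); the column names and delimiter are illustrative:

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Text;

public class DeserializerSketch {
    public static void main(String[] args) throws SerDeException {
        // Illustrative table properties: two columns, comma-delimited text rows.
        Properties props = new Properties();
        props.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
        props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "bigint,string");
        props.setProperty(serdeConstants.FIELD_DELIM, ",");

        LazySimpleSerDe serde = new LazySimpleSerDe();
        serde.initialize(new Configuration(), props);

        // Use it through the Deserializer contract, as the examples above do.
        Deserializer deserializer = serde;
        Object row = deserializer.deserialize(new Text("1,Bob"));
        StructObjectInspector oi = (StructObjectInspector) deserializer.getObjectInspector();
        oi.getAllStructFieldRefs().forEach(f ->
            System.out.println(f.getFieldName() + " = " + oi.getStructFieldData(row, f)));
    }
}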