Use of org.apache.hadoop.hive.serde2.Deserializer in the Apache Hive project.
The class Vectorizer, method canSpecializeReduceSink.
private boolean canSpecializeReduceSink(ReduceSinkDesc desc, boolean isTezOrSpark,
    VectorizationContext vContext, VectorReduceSinkDesc vectorDesc) throws HiveException {

  VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();

  // Various restrictions.

  // Set this if we encounter a condition we were not expecting.
  boolean isUnexpectedCondition = false;

  boolean isVectorizationReduceSinkNativeEnabled =
      HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED);
  String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);

  int limit = desc.getTopN();
  float memUsage = desc.getTopNMemoryUsage();
  boolean hasPTFTopN = (limit >= 0 && memUsage > 0 && desc.isPTFReduceSink());

  boolean hasDistinctColumns = (desc.getDistinctColumnIndices().size() > 0);

  TableDesc keyTableDesc = desc.getKeySerializeInfo();
  Class<? extends Deserializer> keySerializerClass = keyTableDesc.getSerDeClass();
  boolean isKeyBinarySortable =
      (keySerializerClass == org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class);

  TableDesc valueTableDesc = desc.getValueSerializeInfo();
  Class<? extends Deserializer> valueDeserializerClass = valueTableDesc.getSerDeClass();
  boolean isValueLazyBinary =
      (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class) ||
      (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe2.class);

  // We are doing work here we'd normally do in VectorGroupByCommonOperator's constructor.
  // So if we later decide not to specialize, we'll just waste any scratch columns allocated...
  List<ExprNodeDesc> keysDescs = desc.getKeyCols();
  final boolean isEmptyKey = (keysDescs.size() == 0);
  if (!isEmptyKey) {

    VectorExpression[] allKeyExpressions = vContext.getVectorExpressions(keysDescs);

    final int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length];
    final TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length];
    final Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length];
    final VectorExpression[] reduceSinkKeyExpressions;

    // Since a key expression can be a calculation and the key will go into a scratch column,
    // we need the mapping and type information.
    ArrayList<VectorExpression> groupByKeyExpressionsList = new ArrayList<VectorExpression>();
    for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) {
      VectorExpression ve = allKeyExpressions[i];
      reduceSinkKeyColumnMap[i] = ve.getOutputColumnNum();
      reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo();
      reduceSinkKeyColumnVectorTypes[i] =
          VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]);
      if (!IdentityExpression.isColumnOnly(ve)) {
        groupByKeyExpressionsList.add(ve);
      }
    }
    if (groupByKeyExpressionsList.size() == 0) {
      reduceSinkKeyExpressions = null;
    } else {
      reduceSinkKeyExpressions = groupByKeyExpressionsList.toArray(new VectorExpression[0]);
    }

    vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap);
    vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos);
    vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions);
  }
  List<ExprNodeDesc> valueDescs = desc.getValueCols();
  final boolean isEmptyValue = (valueDescs.size() == 0);
  if (!isEmptyValue) {

    VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs);

    final int[] reduceSinkValueColumnMap = new int[allValueExpressions.length];
    final TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length];
    final Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length];
    VectorExpression[] reduceSinkValueExpressions;

    ArrayList<VectorExpression> reduceSinkValueExpressionsList = new ArrayList<VectorExpression>();
    for (int i = 0; i < valueDescs.size(); ++i) {
      VectorExpression ve = allValueExpressions[i];
      reduceSinkValueColumnMap[i] = ve.getOutputColumnNum();
      reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo();
      reduceSinkValueColumnVectorTypes[i] =
          VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]);
      if (!IdentityExpression.isColumnOnly(ve)) {
        reduceSinkValueExpressionsList.add(ve);
      }
    }
    if (reduceSinkValueExpressionsList.size() == 0) {
      reduceSinkValueExpressions = null;
    } else {
      reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]);
    }

    vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap);
    vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos);
    vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions);
  }
  boolean useUniformHash = desc.getReducerTraits().contains(UNIFORM);
  vectorReduceSinkInfo.setUseUniformHash(useUniformHash);

  List<ExprNodeDesc> bucketDescs = desc.getBucketCols();
  final boolean isEmptyBuckets = (bucketDescs == null || bucketDescs.size() == 0);
  List<ExprNodeDesc> partitionDescs = desc.getPartitionCols();
  final boolean isEmptyPartitions = (partitionDescs == null || partitionDescs.size() == 0);

  if (useUniformHash || (isEmptyKey && isEmptyBuckets && isEmptyPartitions)) {

    // NOTE: For Uniform Hash or no buckets/partitions, when the key is empty, we will use the
    // VectorReduceSinkEmptyKeyOperator instead.

  } else {

    // Collect bucket and/or partition information for object hashing.

    int[] reduceSinkBucketColumnMap = null;
    TypeInfo[] reduceSinkBucketTypeInfos = null;
    Type[] reduceSinkBucketColumnVectorTypes = null;
    VectorExpression[] reduceSinkBucketExpressions = null;

    if (!isEmptyBuckets) {
      VectorExpression[] allBucketExpressions = vContext.getVectorExpressions(bucketDescs);

      reduceSinkBucketColumnMap = new int[bucketDescs.size()];
      reduceSinkBucketTypeInfos = new TypeInfo[bucketDescs.size()];
      reduceSinkBucketColumnVectorTypes = new Type[bucketDescs.size()];
      ArrayList<VectorExpression> reduceSinkBucketExpressionsList = new ArrayList<VectorExpression>();
      for (int i = 0; i < bucketDescs.size(); ++i) {
        VectorExpression ve = allBucketExpressions[i];
        reduceSinkBucketColumnMap[i] = ve.getOutputColumnNum();
        reduceSinkBucketTypeInfos[i] = bucketDescs.get(i).getTypeInfo();
        reduceSinkBucketColumnVectorTypes[i] =
            VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkBucketTypeInfos[i]);
        if (!IdentityExpression.isColumnOnly(ve)) {
          reduceSinkBucketExpressionsList.add(ve);
        }
      }
      if (reduceSinkBucketExpressionsList.size() == 0) {
        reduceSinkBucketExpressions = null;
      } else {
        reduceSinkBucketExpressions = reduceSinkBucketExpressionsList.toArray(new VectorExpression[0]);
      }
    }

    int[] reduceSinkPartitionColumnMap = null;
    TypeInfo[] reduceSinkPartitionTypeInfos = null;
    Type[] reduceSinkPartitionColumnVectorTypes = null;
    VectorExpression[] reduceSinkPartitionExpressions = null;

    if (!isEmptyPartitions) {
      VectorExpression[] allPartitionExpressions = vContext.getVectorExpressions(partitionDescs);

      reduceSinkPartitionColumnMap = new int[partitionDescs.size()];
      reduceSinkPartitionTypeInfos = new TypeInfo[partitionDescs.size()];
      reduceSinkPartitionColumnVectorTypes = new Type[partitionDescs.size()];
      ArrayList<VectorExpression> reduceSinkPartitionExpressionsList = new ArrayList<VectorExpression>();
      for (int i = 0; i < partitionDescs.size(); ++i) {
        VectorExpression ve = allPartitionExpressions[i];
        reduceSinkPartitionColumnMap[i] = ve.getOutputColumnNum();
        reduceSinkPartitionTypeInfos[i] = partitionDescs.get(i).getTypeInfo();
        reduceSinkPartitionColumnVectorTypes[i] =
            VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkPartitionTypeInfos[i]);
        if (!IdentityExpression.isColumnOnly(ve)) {
          reduceSinkPartitionExpressionsList.add(ve);
        }
      }
      if (reduceSinkPartitionExpressionsList.size() == 0) {
        reduceSinkPartitionExpressions = null;
      } else {
        reduceSinkPartitionExpressions = reduceSinkPartitionExpressionsList.toArray(new VectorExpression[0]);
      }
    }

    vectorReduceSinkInfo.setReduceSinkBucketColumnMap(reduceSinkBucketColumnMap);
    vectorReduceSinkInfo.setReduceSinkBucketTypeInfos(reduceSinkBucketTypeInfos);
    vectorReduceSinkInfo.setReduceSinkBucketColumnVectorTypes(reduceSinkBucketColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkBucketExpressions(reduceSinkBucketExpressions);

    vectorReduceSinkInfo.setReduceSinkPartitionColumnMap(reduceSinkPartitionColumnMap);
    vectorReduceSinkInfo.setReduceSinkPartitionTypeInfos(reduceSinkPartitionTypeInfos);
    vectorReduceSinkInfo.setReduceSinkPartitionColumnVectorTypes(reduceSinkPartitionColumnVectorTypes);
    vectorReduceSinkInfo.setReduceSinkPartitionExpressions(reduceSinkPartitionExpressions);
  }
  // Remember the condition variables for EXPLAIN regardless.
  vectorDesc.setVectorReduceSinkInfo(vectorReduceSinkInfo);

  vectorDesc.setIsVectorizationReduceSinkNativeEnabled(isVectorizationReduceSinkNativeEnabled);
  vectorDesc.setEngine(engine);
  vectorDesc.setIsEmptyKey(isEmptyKey);
  vectorDesc.setIsEmptyValue(isEmptyValue);
  vectorDesc.setIsEmptyBuckets(isEmptyBuckets);
  vectorDesc.setIsEmptyPartitions(isEmptyPartitions);
  vectorDesc.setHasPTFTopN(hasPTFTopN);
  vectorDesc.setHasDistinctColumns(hasDistinctColumns);
  vectorDesc.setIsKeyBinarySortable(isKeyBinarySortable);
  vectorDesc.setIsValueLazyBinary(isValueLazyBinary);
  vectorDesc.setIsAcidChange(
      desc.getWriteType() == AcidUtils.Operation.DELETE ||
      desc.getWriteType() == AcidUtils.Operation.UPDATE);

  // This indicates we logged an inconsistency (from our point-of-view) and will not make this
  // operator native...
  vectorDesc.setIsUnexpectedCondition(isUnexpectedCondition);

  // Many restrictions.
  if (!isVectorizationReduceSinkNativeEnabled ||
      !isTezOrSpark ||
      hasPTFTopN ||
      hasDistinctColumns ||
      !isKeyBinarySortable ||
      !isValueLazyBinary ||
      isUnexpectedCondition) {
    return false;
  }
  return true;
}
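The only places this method touches the Deserializer type are the key and value serde checks near the top: specialization requires the ReduceSink key to be serialized with BinarySortableSerDe and the value with LazyBinarySerDe or LazyBinarySerDe2. As a minimal sketch under those same assumptions, using only the TableDesc.getSerDeClass() call shown above and a hypothetical helper class name, the two checks look roughly like this in isolation:

import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe2;

// Hypothetical helper mirroring the serde checks from canSpecializeReduceSink above.
final class ReduceSinkSerDeChecks {

  // True when the ReduceSink key is serialized with BinarySortableSerDe.
  static boolean isKeyBinarySortable(TableDesc keyTableDesc) {
    Class<? extends Deserializer> serDeClass = keyTableDesc.getSerDeClass();
    return serDeClass == BinarySortableSerDe.class;
  }

  // True when the ReduceSink value is serialized with LazyBinarySerDe or LazyBinarySerDe2.
  static boolean isValueLazyBinary(TableDesc valueTableDesc) {
    Class<? extends Deserializer> serDeClass = valueTableDesc.getSerDeClass();
    return serDeClass == LazyBinarySerDe.class || serDeClass == LazyBinarySerDe2.class;
  }
}

If either check fails, the method still records the condition variables on the VectorReduceSinkDesc for EXPLAIN output, but returns false so the operator is not made native.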
Use of org.apache.hadoop.hive.serde2.Deserializer in the Apache Hive project.
The class TestSerdeWithFieldComments, method testFieldComments.
@Test
public void testFieldComments() throws MetaException, SerDeException {
  StructObjectInspector mockSOI = mock(StructObjectInspector.class);
  when(mockSOI.getCategory()).thenReturn(ObjectInspector.Category.STRUCT);
  List fieldRefs = new ArrayList<StructField>();
  // Add field with a comment...
  fieldRefs.add(mockedStructField("first", "type name 1", "this is a comment"));
  // ... and one without
  fieldRefs.add(mockedStructField("second", "type name 2", null));
  when(mockSOI.getAllStructFieldRefs()).thenReturn(fieldRefs);

  Deserializer mockDe = mock(Deserializer.class);
  when(mockDe.getObjectInspector()).thenReturn(mockSOI);
  List<FieldSchema> result = HiveMetaStoreUtils.getFieldsFromDeserializer("testTable", mockDe);

  assertEquals(2, result.size());
  assertEquals("first", result.get(0).getName());
  assertEquals("this is a comment", result.get(0).getComment());
  assertEquals("second", result.get(1).getName());
  assertEquals("from deserializer", result.get(1).getComment());
}
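The mockedStructField helper used above is defined elsewhere in the test class and is not part of this excerpt. A plausible reconstruction, assuming Mockito and the standard StructField and ObjectInspector accessors, might look like this (the real implementation may differ):

// Hypothetical sketch of the helper referenced in the test above.
private StructField mockedStructField(String name, String oiTypeName, String comment) {
  ObjectInspector mockOI = mock(ObjectInspector.class);
  when(mockOI.getTypeName()).thenReturn(oiTypeName);
  StructField mockField = mock(StructField.class);
  when(mockField.getFieldName()).thenReturn(name);
  when(mockField.getFieldObjectInspector()).thenReturn(mockOI);
  when(mockField.getFieldComment()).thenReturn(comment);
  return mockField;
}

The final assertion shows the behavior under test: when a field carries no comment, getFieldsFromDeserializer fills in the placeholder comment "from deserializer".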
Use of org.apache.hadoop.hive.serde2.Deserializer in the Apache Hive project.
The class TestDeserializer, method testListDeserialize.
@Test
public void testListDeserialize() {
  Schema schema = new Schema(optional(1, "list_type", Types.ListType.ofOptional(2, Types.LongType.get())));
  StructObjectInspector inspector = ObjectInspectorFactory.getStandardStructObjectInspector(
      Arrays.asList("list_type"),
      Arrays.asList(
          ObjectInspectorFactory.getStandardListObjectInspector(
              PrimitiveObjectInspectorFactory.writableLongObjectInspector)));

  Deserializer deserializer = new Deserializer.Builder()
      .schema(schema)
      .writerInspector((StructObjectInspector) IcebergObjectInspector.create(schema))
      .sourceInspector(inspector)
      .build();

  Record expected = GenericRecord.create(schema);
  expected.set(0, Collections.singletonList(1L));

  Object[] data = new Object[] { new Object[] { new LongWritable(1L) } };
  Record actual = deserializer.deserialize(data);

  Assert.assertEquals(expected, actual);
}
Use of org.apache.hadoop.hive.serde2.Deserializer in the Apache Hive project.
The class TestDeserializer, method testSchemaDeserialize.
@Test
public void testSchemaDeserialize() {
  StandardStructObjectInspector schemaObjectInspector =
      ObjectInspectorFactory.getStandardStructObjectInspector(
          Arrays.asList("0:col1", "1:col2"),
          Arrays.asList(
              PrimitiveObjectInspectorFactory.writableLongObjectInspector,
              PrimitiveObjectInspectorFactory.writableStringObjectInspector));

  Deserializer deserializer = new Deserializer.Builder()
      .schema(CUSTOMER_SCHEMA)
      .writerInspector((StructObjectInspector) IcebergObjectInspector.create(CUSTOMER_SCHEMA))
      .sourceInspector(schemaObjectInspector)
      .build();

  Record expected = GenericRecord.create(CUSTOMER_SCHEMA);
  expected.set(0, 1L);
  expected.set(1, "Bob");

  Record actual = deserializer.deserialize(new Object[] { new LongWritable(1L), new Text("Bob") });

  Assert.assertEquals(expected, actual);
}
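CUSTOMER_SCHEMA is a constant defined elsewhere in the test class and is not shown in this excerpt. Judging only from the data used here (a long id set at position 0 and the string "Bob" at position 1), a schema of roughly this shape would be consistent with the test; the actual field names and ids may differ:

// Hypothetical stand-in for the CUSTOMER_SCHEMA constant referenced above,
// built with the same Schema/Types/optional API used in testListDeserialize.
private static final Schema CUSTOMER_SCHEMA = new Schema(
    optional(1, "customer_id", Types.LongType.get()),
    optional(2, "first_name", Types.StringType.get()));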
Use of org.apache.hadoop.hive.serde2.Deserializer in the Apache Hive project.
The class TestAvroDeserializer, method canDeserializeTimestamps.
/**
* Test whether Avro timestamps can be deserialized according to new behavior (storage in UTC but
* LocalDateTime semantics as timestamps are converted back to the writer time zone) as well as
* old behavior (Instant semantics).
*/
@Test
public void canDeserializeTimestamps() throws SerDeException, IOException {
  List<String> columnNames = new ArrayList<>();
  columnNames.add("timestampField");
  List<TypeInfo> columnTypes = new ArrayList<>();
  columnTypes.add(TypeInfoFactory.getPrimitiveTypeInfo("timestamp"));
  Schema readerSchema = AvroSerdeUtils.getSchemaFor(TestAvroObjectInspectorGenerator.TIMESTAMP_SCHEMA);

  // 2019-01-02 00:00:00 GMT is 1546387200000 milliseconds after the epoch
  GenericData.Record record = new GenericData.Record(readerSchema);
  record.put("timestampField", 1546387200999L);
  assertTrue(GENERIC_DATA.validate(readerSchema, record));

  AvroGenericRecordWritable agrw = new AvroGenericRecordWritable(ZoneId.of("America/New_York"), false, false);
  agrw.setRecord(record);
  agrw.setFileSchema(readerSchema);
  agrw.setRecordReaderID(new UID());

  AvroDeserializer deserializer = new AvroDeserializer();
  ArrayList<Object> row =
      (ArrayList<Object>) deserializer.deserialize(columnNames, columnTypes, agrw, readerSchema);
  Timestamp resultTimestamp = (Timestamp) row.get(0);
  // 2019-01-02 00:00:00 GMT is 2019-01-01 19:00:00 GMT-0500 (America/New_York / EST)
  assertEquals(Timestamp.valueOf("2019-01-01 19:00:00.999"), resultTimestamp);

  // Do the same without specifying a writer time zone. This tests deserialization of older
  // records, which should be interpreted with Instant semantics.
  AvroGenericRecordWritable agrw2 = new AvroGenericRecordWritable();
  agrw2.setRecord(record);
  agrw2.setFileSchema(readerSchema);
  agrw2.setRecordReaderID(new UID());
  row = (ArrayList<Object>) deserializer.deserialize(columnNames, columnTypes, agrw2, readerSchema);
  resultTimestamp = (Timestamp) row.get(0);
  // 2019-01-02 00:00:00 GMT is 2019-01-01 16:00:00 in zone GMT-0800 (PST)
  // This is the time zone of the VM running the test.
  assertEquals(Timestamp.valueOf("2019-01-01 16:00:00.999"), resultTimestamp);
}
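The expected values in both assertions follow from plain java.time arithmetic on the stored epoch-millisecond value. A small standalone sketch of that conversion (not part of the test itself; the Los Angeles zone stands in for the PST VM time zone the test assumes):

import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;

class TimestampZoneArithmetic {
  public static void main(String[] args) {
    // 1546387200999 ms after the epoch is 2019-01-02 00:00:00.999 UTC.
    Instant stored = Instant.ofEpochMilli(1546387200999L);

    // Writer zone America/New_York (UTC-5 in January): 2019-01-01T19:00:00.999,
    // which matches the first assertion.
    System.out.println(LocalDateTime.ofInstant(stored, ZoneId.of("America/New_York")));

    // No writer zone: Instant semantics, rendered in the VM's zone, assumed here to be
    // America/Los_Angeles (UTC-8 in January): 2019-01-01T16:00:00.999, matching the second assertion.
    System.out.println(LocalDateTime.ofInstant(stored, ZoneId.of("America/Los_Angeles")));
  }
}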