Use of org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo in project hive by apache.
The class VectorizedListColumnReader, method readBatch.
@Override
public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
  ListColumnVector lcv = (ListColumnVector) column;
  // Before reading the batch, initialize offsets & lengths to the default size;
  // the actual size is assigned in setChildrenInfo() once reading completes.
  lcv.offsets = new long[VectorizedRowBatch.DEFAULT_SIZE];
  lcv.lengths = new long[VectorizedRowBatch.DEFAULT_SIZE];
  // Because the length of ListColumnVector.child is not known yet,
  // valueList temporarily holds all data for the ListColumnVector.
  List<Object> valueList = new ArrayList<>();
  PrimitiveObjectInspector.PrimitiveCategory category =
      ((PrimitiveTypeInfo) ((ListTypeInfo) columnType).getListElementTypeInfo()).getPrimitiveCategory();
  // Read the first row in the Parquet data page; this happens only once per instance.
  if (isFirstRow) {
    if (!fetchNextValue(category)) {
      return;
    }
    isFirstRow = false;
  }
  int index = collectDataFromParquetPage(total, lcv, valueList, category);
  // Convert valueList to an array for ListColumnVector.child.
  convertValueListToListColumnVector(category, lcv, valueList, index);
}
Use of org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader.
// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages,
    List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion, ZoneId writerTimezone,
    boolean skipProlepticConversion, boolean legacyConversionEnabled, int depth) throws IOException {
  List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
  switch (typeInfo.getCategory()) {
    case PRIMITIVE:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      if (fileSchema.getColumns().contains(descriptors.get(0))) {
        return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
            skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, type, typeInfo);
      } else {
        // Support for schema evolution
        return new VectorizedDummyColumnReader();
      }
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
      List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
      List<Type> types = type.asGroupType().getFields();
      for (int i = 0; i < fieldTypes.size(); i++) {
        VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors,
            skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, depth + 1);
        if (r != null) {
          fieldReaders.add(r);
        } else {
          throw new RuntimeException("Fail to build Parquet vectorized reader based on Hive type "
              + fieldTypes.get(i).getTypeName() + " and Parquet type" + types.get(i).toString());
        }
      }
      return new VectorizedStructColumnReader(fieldReaders);
    case LIST:
      checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
          skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled,
          getElementType(type), typeInfo);
    case MAP:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      // Handle the different Map definitions in Parquet, e.g.:
      // definition with 1 group:
      //   repeated group map (MAP_KEY_VALUE)
      //     {required binary key (UTF8); optional binary value (UTF8);}
      // definition with 2 groups:
      //   optional group m1 (MAP) {
      //     repeated group map (MAP_KEY_VALUE)
      //       {required binary key (UTF8); optional binary value (UTF8);}
      //   }
      int nestGroup = 0;
      GroupType groupType = type.asGroupType();
      // Unwrap nested groups until the key/value pair is reached, but no deeper than MAP_DEFINITION_LEVEL_MAX.
      while (groupType.getFieldCount() < 2) {
        if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
          throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX + " level is found in Map definition, "
              + "Failed to get the field types for Map with type " + type);
        }
        groupType = groupType.getFields().get(0).asGroupType();
        nestGroup++;
      }
      List<Type> kvTypes = groupType.getFields();
      VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(descriptors.get(0),
          pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone, skipProlepticConversion,
          legacyConversionEnabled, kvTypes.get(0), typeInfo);
      VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(descriptors.get(1),
          pages.getPageReader(descriptors.get(1)), skipTimestampConversion, writerTimezone, skipProlepticConversion,
          legacyConversionEnabled, kvTypes.get(1), typeInfo);
      return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
    case UNION:
    default:
      throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
  }
}
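To illustrate the recursion over TypeInfo categories that drives the reader construction above, here is a small hypothetical walker (not part of Hive) that descends a nested Hive type the same way, using only public serde2 type-info APIs; the class name TypeInfoWalker is an assumption for the sketch:

import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Illustrative sketch only, not part of the Hive code base.
public class TypeInfoWalker {
  static void walk(TypeInfo typeInfo, int depth) {
    String indent = "  ".repeat(depth);
    switch (typeInfo.getCategory()) {
      case PRIMITIVE:
        System.out.println(indent + "primitive: " + typeInfo.getTypeName());
        break;
      case STRUCT:
        StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
        List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
        System.out.println(indent + "struct with " + fieldTypes.size() + " fields");
        for (TypeInfo fieldType : fieldTypes) {
          walk(fieldType, depth + 1);
        }
        break;
      case LIST:
        System.out.println(indent + "list");
        walk(((ListTypeInfo) typeInfo).getListElementTypeInfo(), depth + 1);
        break;
      case MAP:
        MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
        System.out.println(indent + "map");
        walk(mapTypeInfo.getMapKeyTypeInfo(), depth + 1);
        walk(mapTypeInfo.getMapValueTypeInfo(), depth + 1);
        break;
      default:
        System.out.println(indent + "unsupported: " + typeInfo.getCategory());
    }
  }

  public static void main(String[] args) {
    walk(TypeInfoUtils.getTypeInfoFromTypeString("struct<a:int,b:array<map<string,double>>>"), 0);
  }
}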
Use of org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo in project hive by apache.
The class VectorizedMapColumnReader, method readBatch.
@Override
public void readBatch(int total, ColumnVector column, TypeInfo columnType) throws IOException {
  MapColumnVector mapColumnVector = (MapColumnVector) column;
  MapTypeInfo mapTypeInfo = (MapTypeInfo) columnType;
  ListTypeInfo keyListTypeInfo = new ListTypeInfo();
  keyListTypeInfo.setListElementTypeInfo(mapTypeInfo.getMapKeyTypeInfo());
  ListTypeInfo valueListTypeInfo = new ListTypeInfo();
  valueListTypeInfo.setListElementTypeInfo(mapTypeInfo.getMapValueTypeInfo());
  // Initialize two ListColumnVectors for the keys and values.
  ListColumnVector keyListColumnVector = new ListColumnVector();
  ListColumnVector valueListColumnVector = new ListColumnVector();
  // Read the keys and values.
  keyColumnReader.readBatch(total, keyListColumnVector, keyListTypeInfo);
  valueColumnReader.readBatch(total, valueListColumnVector, valueListTypeInfo);
  // Set the related attributes according to the keys and values.
  mapColumnVector.keys = keyListColumnVector.child;
  mapColumnVector.values = valueListColumnVector.child;
  mapColumnVector.isNull = keyListColumnVector.isNull;
  mapColumnVector.offsets = keyListColumnVector.offsets;
  mapColumnVector.lengths = keyListColumnVector.lengths;
  mapColumnVector.childCount = keyListColumnVector.childCount;
  mapColumnVector.isRepeating = keyListColumnVector.isRepeating && valueListColumnVector.isRepeating;
}
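The key/value wrapping above can be reproduced standalone. A minimal sketch follows (the class name MapToListTypeInfoExample is hypothetical), built only on public Hive type-info APIs:

import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

// Illustrative sketch only, not part of the Hive code base.
public class MapToListTypeInfoExample {
  public static void main(String[] args) {
    // map<string,int>
    MapTypeInfo mapTypeInfo = (MapTypeInfo) TypeInfoFactory.getMapTypeInfo(
        TypeInfoFactory.stringTypeInfo, TypeInfoFactory.intTypeInfo);

    // Wrap the key and value types in ListTypeInfo, as readBatch does,
    // so the key and value streams can be read as list columns.
    ListTypeInfo keyListTypeInfo = new ListTypeInfo();
    keyListTypeInfo.setListElementTypeInfo(mapTypeInfo.getMapKeyTypeInfo());
    ListTypeInfo valueListTypeInfo = new ListTypeInfo();
    valueListTypeInfo.setListElementTypeInfo(mapTypeInfo.getMapValueTypeInfo());

    System.out.println(keyListTypeInfo.getTypeName());    // array<string>
    System.out.println(valueListTypeInfo.getTypeName());  // array<int>
  }
}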
Use of org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo in project hive by apache.
The class ParquetDataColumnReaderFactory, method getDataColumnReaderByTypeHelper.
private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(boolean isDictionary, PrimitiveType parquetType,
    TypeInfo hiveType, Dictionary dictionary, ValuesReader valuesReader, boolean skipTimestampConversion,
    ZoneId writerTimezone, boolean legacyConversionEnabled) throws IOException {
  // Max length for the varchar and char cases.
  int length = getVarcharLength(hiveType);
  TypeInfo realHiveType = (hiveType instanceof ListTypeInfo)
      ? ((ListTypeInfo) hiveType).getListElementTypeInfo() : hiveType;
  String typeName = TypeInfoUtils.getBaseName(realHiveType.getTypeName());
  int hivePrecision = typeName.equalsIgnoreCase(serdeConstants.DECIMAL_TYPE_NAME)
      ? ((DecimalTypeInfo) realHiveType).getPrecision() : 0;
  int hiveScale = typeName.equalsIgnoreCase(serdeConstants.DECIMAL_TYPE_NAME)
      ? ((DecimalTypeInfo) realHiveType).getScale() : 0;
  switch (parquetType.getPrimitiveTypeName()) {
    case INT32:
      if (ETypeConverter.isUnsignedInteger(parquetType)) {
        return isDictionary ? new TypesFromUInt32PageReader(dictionary, length, hivePrecision, hiveScale)
            : new TypesFromUInt32PageReader(valuesReader, length, hivePrecision, hiveScale);
      } else if (parquetType.getLogicalTypeAnnotation() instanceof DecimalLogicalTypeAnnotation) {
        DecimalLogicalTypeAnnotation logicalType =
            (DecimalLogicalTypeAnnotation) parquetType.getLogicalTypeAnnotation();
        final short scale = (short) logicalType.getScale();
        return isDictionary ? new TypesFromInt32DecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale)
            : new TypesFromInt32DecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale);
      } else {
        return isDictionary ? new TypesFromInt32PageReader(dictionary, length, hivePrecision, hiveScale)
            : new TypesFromInt32PageReader(valuesReader, length, hivePrecision, hiveScale);
      }
    case INT64:
      LogicalTypeAnnotation logicalType = parquetType.getLogicalTypeAnnotation();
      if (logicalType instanceof TimestampLogicalTypeAnnotation) {
        TimestampLogicalTypeAnnotation timestampLogicalType = (TimestampLogicalTypeAnnotation) logicalType;
        boolean isAdjustedToUTC = timestampLogicalType.isAdjustedToUTC();
        TimeUnit timeUnit = timestampLogicalType.getUnit();
        return isDictionary ? new TypesFromInt64PageReader(dictionary, length, isAdjustedToUTC, timeUnit)
            : new TypesFromInt64PageReader(valuesReader, length, isAdjustedToUTC, timeUnit);
      }
      if (ETypeConverter.isUnsignedInteger(parquetType)) {
        return isDictionary ? new TypesFromUInt64PageReader(dictionary, length, hivePrecision, hiveScale)
            : new TypesFromUInt64PageReader(valuesReader, length, hivePrecision, hiveScale);
      }
      if (logicalType instanceof DecimalLogicalTypeAnnotation) {
        DecimalLogicalTypeAnnotation decimalLogicalType = (DecimalLogicalTypeAnnotation) logicalType;
        final short scale = (short) decimalLogicalType.getScale();
        return isDictionary ? new TypesFromInt64DecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale)
            : new TypesFromInt64DecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale);
      }
      return isDictionary ? new TypesFromInt64PageReader(dictionary, length, hivePrecision, hiveScale)
          : new TypesFromInt64PageReader(valuesReader, length, hivePrecision, hiveScale);
    case FLOAT:
      return isDictionary ? new TypesFromFloatPageReader(dictionary, length, hivePrecision, hiveScale)
          : new TypesFromFloatPageReader(valuesReader, length, hivePrecision, hiveScale);
    case INT96:
      ZoneId targetZone = skipTimestampConversion
          ? ZoneOffset.UTC : firstNonNull(writerTimezone, TimeZone.getDefault().toZoneId());
      return isDictionary ? new TypesFromInt96PageReader(dictionary, length, targetZone, legacyConversionEnabled)
          : new TypesFromInt96PageReader(valuesReader, length, targetZone, legacyConversionEnabled);
    case BOOLEAN:
      return isDictionary ? new TypesFromBooleanPageReader(dictionary, length)
          : new TypesFromBooleanPageReader(valuesReader, length);
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      return getConvertorFromBinary(isDictionary, parquetType, hiveType, valuesReader, dictionary);
    case DOUBLE:
      return isDictionary ? new TypesFromDoublePageReader(dictionary, length, hivePrecision, hiveScale)
          : new TypesFromDoublePageReader(valuesReader, length, hivePrecision, hiveScale);
    default:
      return isDictionary ? new DefaultParquetDataColumnReader(dictionary, length, hivePrecision, hiveScale)
          : new DefaultParquetDataColumnReader(valuesReader, length, hivePrecision, hiveScale);
  }
}
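The first few lines of the helper above can be exercised on their own; here is a hypothetical snippet (not in Hive) showing how a list-wrapped decimal type resolves to its element's precision and scale:

import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Illustrative sketch only, not part of the Hive code base.
public class DecimalElementTypeExample {
  public static void main(String[] args) {
    // array<decimal(10,2)>
    TypeInfo hiveType = TypeInfoUtils.getTypeInfoFromTypeString("array<decimal(10,2)>");

    // Same unwrapping as getDataColumnReaderByTypeHelper: for a list type, use the element type.
    TypeInfo realHiveType = (hiveType instanceof ListTypeInfo)
        ? ((ListTypeInfo) hiveType).getListElementTypeInfo() : hiveType;
    String baseName = TypeInfoUtils.getBaseName(realHiveType.getTypeName());

    System.out.println(baseName);                                         // decimal
    System.out.println(((DecimalTypeInfo) realHiveType).getPrecision());  // 10
    System.out.println(((DecimalTypeInfo) realHiveType).getScale());      // 2
  }
}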
Use of org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo in project hive by apache.
The class TestHiveSchemaConverter, method testArrayInArray.
@Test
public void testArrayInArray() throws Exception {
  final List<String> columnNames = createHiveColumnsFrom("arrayCol");
  ListTypeInfo listTypeInfo = new ListTypeInfo();
  listTypeInfo.setListElementTypeInfo(TypeInfoUtils.getTypeInfosFromTypeString("int").get(0));
  List<TypeInfo> typeInfos = new ArrayList<>();
  ListTypeInfo listTypeInfo2 = new ListTypeInfo();
  listTypeInfo2.setListElementTypeInfo(listTypeInfo);
  typeInfos.add(listTypeInfo2);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, typeInfos);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(
      "message hive_schema {\n"
      + "  optional group arrayCol (LIST) {\n"
      + "    repeated group bag {\n"
      + "      optional group array_element (LIST) {\n"
      + "        repeated group bag {\n"
      + "          optional int32 array_element;\n"
      + "        }\n"
      + "      }\n"
      + "    }\n"
      + "  }\n"
      + "}\n");
  assertEquals(expectedMT, messageTypeFound);
}
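For comparison, the nested array<array<int>> type used in the test can also be obtained without wiring up ListTypeInfo objects by hand; a small hypothetical snippet (class name NestedListTypeInfoExample is an assumption):

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

// Illustrative sketch only, not part of the Hive code base.
public class NestedListTypeInfoExample {
  public static void main(String[] args) {
    // Compose array<array<int>> through the factory ...
    TypeInfo nested = TypeInfoFactory.getListTypeInfo(
        TypeInfoFactory.getListTypeInfo(TypeInfoFactory.intTypeInfo));
    // ... or parse it directly from the Hive type string.
    TypeInfo parsed = TypeInfoUtils.getTypeInfoFromTypeString("array<array<int>>");

    System.out.println(nested.getTypeName());   // array<array<int>>
    System.out.println(nested.equals(parsed));  // true
  }
}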