Search in sources :

Example 1 with LogicalTypeAnnotation

use of org.apache.parquet.schema.LogicalTypeAnnotation in project flink by apache.

the class ParquetRowDataWriter method createWriter.

/**
 * Chooses the {@code FieldWriter} for a Flink logical type paired with its Parquet schema type.
 *
 * <p>Group (non-primitive) Parquet types are matched first: an {@code ArrayType} with a LIST
 * annotation, a {@code MapType} with a MAP annotation, or a {@code RowType}. Primitive Parquet
 * types are dispatched on {@code t.getTypeRoot()}.
 *
 * @param t the Flink logical type of the field
 * @param type the Parquet schema type of the field
 * @return the writer matching the type combination
 * @throws UnsupportedOperationException if no writer exists for the combination
 */
private FieldWriter createWriter(LogicalType t, Type type) {
    if (!type.isPrimitive()) {
        final GroupType group = type.asGroupType();
        final LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation();
        if (t instanceof ArrayType && annotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) {
            ArrayType arrayType = (ArrayType) t;
            return new ArrayWriter(arrayType.getElementType(), group);
        }
        if (t instanceof MapType && annotation instanceof LogicalTypeAnnotation.MapLogicalTypeAnnotation) {
            MapType mapType = (MapType) t;
            return new MapWriter(mapType.getKeyType(), mapType.getValueType(), group);
        }
        if (t instanceof RowType && type instanceof GroupType) {
            return new RowWriter((RowType) t, group);
        }
        throw new UnsupportedOperationException("Unsupported type: " + type);
    }

    switch (t.getTypeRoot()) {
        case CHAR:
        case VARCHAR:
            return new StringWriter();
        case BOOLEAN:
            return new BooleanWriter();
        case BINARY:
        case VARBINARY:
            return new BinaryWriter();
        case DECIMAL: {
            DecimalType decimal = (DecimalType) t;
            return createDecimalWriter(decimal.getPrecision(), decimal.getScale());
        }
        case TINYINT:
            return new ByteWriter();
        case SMALLINT:
            return new ShortWriter();
        // DATE and TIME share the int writer with INTEGER.
        case DATE:
        case TIME_WITHOUT_TIME_ZONE:
        case INTEGER:
            return new IntWriter();
        case BIGINT:
            return new LongWriter();
        case FLOAT:
            return new FloatWriter();
        case DOUBLE:
            return new DoubleWriter();
        case TIMESTAMP_WITHOUT_TIME_ZONE:
            return new TimestampWriter(((TimestampType) t).getPrecision());
        case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
            return new TimestampWriter(((LocalZonedTimestampType) t).getPrecision());
        default:
            throw new UnsupportedOperationException("Unsupported type: " + type);
    }
}
Also used : LocalZonedTimestampType(org.apache.flink.table.types.logical.LocalZonedTimestampType) RowType(org.apache.flink.table.types.logical.RowType) MapType(org.apache.flink.table.types.logical.MapType) ArrayType(org.apache.flink.table.types.logical.ArrayType) GroupType(org.apache.parquet.schema.GroupType) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) DecimalType(org.apache.flink.table.types.logical.DecimalType) TimestampType(org.apache.flink.table.types.logical.TimestampType) LocalZonedTimestampType(org.apache.flink.table.types.logical.LocalZonedTimestampType)

Example 2 with LogicalTypeAnnotation

use of org.apache.parquet.schema.LogicalTypeAnnotation in project hive by apache.

the class ParquetDataColumnReaderFactory method getConvertorFromBinary.

/**
 * Builds the column reader for a binary-backed Parquet primitive, based on its logical type
 * annotation.
 *
 * <p>A decimal annotation yields a {@code TypesFromDecimalPageReader}, a string annotation a
 * {@code TypesFromStringPageReader}. A missing annotation, or one the visitor does not handle,
 * falls back to {@code DefaultParquetDataColumnReader}.
 *
 * @param isDict whether to read through {@code dictionary} instead of {@code valuesReader}
 * @param parquetType the Parquet primitive type of the column
 * @param hiveType the Hive type of the column; list element / map value types are unwrapped
 *     when looking up decimal precision and scale
 * @param valuesReader source of values when {@code isDict} is false
 * @param dictionary source of values when {@code isDict} is true
 * @return the reader selected for the column
 */
private static ParquetDataColumnReader getConvertorFromBinary(boolean isDict, PrimitiveType parquetType, TypeInfo hiveType, ValuesReader valuesReader, Dictionary dictionary) {
    final LogicalTypeAnnotation annotation = parquetType.getLogicalTypeAnnotation();
    // max length for varchar and char cases
    final int length = getVarcharLength(hiveType);

    TypeInfo effectiveHiveType;
    if (hiveType instanceof ListTypeInfo) {
        effectiveHiveType = ((ListTypeInfo) hiveType).getListElementTypeInfo();
    } else if (hiveType instanceof MapTypeInfo) {
        effectiveHiveType = ((MapTypeInfo) hiveType).getMapValueTypeInfo();
    } else {
        effectiveHiveType = hiveType;
    }

    final String baseTypeName = TypeInfoUtils.getBaseName(effectiveHiveType.getTypeName());
    final boolean hiveDecimal = baseTypeName.equalsIgnoreCase(serdeConstants.DECIMAL_TYPE_NAME);
    // Precision and scale are only meaningful for Hive decimals; everything else uses 0.
    final int hivePrecision = hiveDecimal ? ((DecimalTypeInfo) effectiveHiveType).getPrecision() : 0;
    final int hiveScale = hiveDecimal ? ((DecimalTypeInfo) effectiveHiveType).getScale() : 0;

    if (annotation != null) {
        Optional<ParquetDataColumnReader> specialized = annotation.accept(new LogicalTypeAnnotationVisitor<ParquetDataColumnReader>() {

            @Override
            public Optional<ParquetDataColumnReader> visit(DecimalLogicalTypeAnnotation decimalAnnotation) {
                final short scale = (short) decimalAnnotation.getScale();
                if (isDict) {
                    return Optional.of(new TypesFromDecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale));
                }
                return Optional.of(new TypesFromDecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale));
            }

            @Override
            public Optional<ParquetDataColumnReader> visit(StringLogicalTypeAnnotation stringAnnotation) {
                if (isDict) {
                    return Optional.of(new TypesFromStringPageReader(dictionary, length));
                }
                return Optional.of(new TypesFromStringPageReader(valuesReader, length));
            }
        });
        if (specialized.isPresent()) {
            return specialized.get();
        }
    }

    // No annotation, or an annotation without a dedicated reader.
    if (isDict) {
        return new DefaultParquetDataColumnReader(dictionary, length);
    }
    return new DefaultParquetDataColumnReader(valuesReader, length);
}
Also used : DecimalLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) Optional(java.util.Optional) StringLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) StringLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation) DecimalLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TimestampLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo)

Example 3 with LogicalTypeAnnotation

use of org.apache.parquet.schema.LogicalTypeAnnotation in project hive by apache.

the class HiveParquetSchemaTestUtils method testConversion.

/**
 * Converts the given Hive columns to a Parquet schema and asserts that it equals the schema
 * parsed from {@code actualSchema}, then compares the logical type annotation of every
 * top-level field pairwise.
 *
 * @param columnNamesStr Hive column names, in the format accepted by {@code createHiveColumnsFrom}
 * @param columnsTypeStr Hive column types, in the format accepted by {@code createHiveTypeInfoFrom}
 * @param actualSchema textual Parquet message type the conversion is expected to produce
 * @param conf configuration passed to {@code HiveSchemaConverter.convert}
 * @throws Exception if conversion or parsing fails
 */
public static void testConversion(final String columnNamesStr, final String columnsTypeStr, final String actualSchema, final Configuration conf) throws Exception {
    final MessageType converted = HiveSchemaConverter.convert(createHiveColumnsFrom(columnNamesStr), createHiveTypeInfoFrom(columnsTypeStr), conf);
    final MessageType expected = MessageTypeParser.parseMessageType(actualSchema);
    final String description = "converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema;
    assertEquals(description, expected, converted);

    // Required to check the original types manually as PrimitiveType.equals does not care about it
    final List<Type> wantFields = expected.getFields();
    final List<Type> gotFields = converted.getFields();
    for (int idx = 0; idx < wantFields.size(); idx++) {
        assertEquals("Logical type annotations of the field do not match", wantFields.get(idx).getLogicalTypeAnnotation(), gotFields.get(idx).getLogicalTypeAnnotation());
    }
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType)

Example 4 with LogicalTypeAnnotation

use of org.apache.parquet.schema.LogicalTypeAnnotation in project hive by apache.

the class HiveParquetSchemaTestUtils method testLogicalTypeAnnotations.

/**
 * Converts the given Hive columns to a Parquet schema and checks the logical type annotation
 * of every top-level field against {@code expectedLogicalTypes}.
 *
 * <p>A field whose name maps to a non-null entry must carry an equal annotation; a field with
 * no entry (or a null entry) must carry no annotation.
 *
 * @param hiveColumnNames Hive column names, in the format accepted by {@code createHiveColumnsFrom}
 * @param hiveColumnTypes Hive column types, in the format accepted by {@code createHiveTypeInfoFrom}
 * @param expectedLogicalTypes expected annotation per field name
 * @param conf configuration passed to {@code HiveSchemaConverter.convert}
 * @throws Exception if conversion fails
 */
public static void testLogicalTypeAnnotations(final String hiveColumnNames, final String hiveColumnTypes, final Map<String, LogicalTypeAnnotation> expectedLogicalTypes, Configuration conf) throws Exception {
    final MessageType converted = HiveSchemaConverter.convert(createHiveColumnsFrom(hiveColumnNames), createHiveTypeInfoFrom(hiveColumnTypes), conf);
    for (Type field : converted.getFields()) {
        final LogicalTypeAnnotation want = expectedLogicalTypes.get(field.getName());
        final LogicalTypeAnnotation got = field.getLogicalTypeAnnotation();
        if (want == null) {
            assertNull("The logical type annotation must be null.", got);
            continue;
        }
        assertNotNull("The logical type annotation cannot be null.", got);
        assertEquals("Logical type annotations of the field do not match", want, got);
    }
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType)

Example 5 with LogicalTypeAnnotation

use of org.apache.parquet.schema.LogicalTypeAnnotation in project hive by apache.

the class ParquetDataColumnReaderFactory method getDataColumnReaderByTypeHelper.

/**
 * Selects the {@code ParquetDataColumnReader} for a Parquet primitive column by switching on
 * its primitive type name and, for INT32/INT64, on its logical type annotation.
 *
 * <p>Every branch builds the reader from {@code dictionary} when {@code isDictionary} is true
 * and from {@code valuesReader} otherwise. BINARY and FIXED_LEN_BYTE_ARRAY are delegated to
 * {@code getConvertorFromBinary}.
 *
 * @param isDictionary whether to read through {@code dictionary} instead of {@code valuesReader}
 * @param parquetType the Parquet primitive type of the column
 * @param hiveType the Hive type of the column; a list type is unwrapped to its element type
 *     when looking up decimal precision and scale
 * @param dictionary source of values when {@code isDictionary} is true
 * @param valuesReader source of values when {@code isDictionary} is false
 * @param skipTimestampConversion when true, INT96 values are read with UTC as the target zone
 * @param writerTimezone zone used for INT96 when conversion is not skipped; the JVM default
 *     zone is used if this is null
 * @param legacyConversionEnabled passed through to the INT96 reader
 * @return the reader selected for the column
 * @throws IOException declared by the signature; no statement visible here throws it directly
 */
private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(boolean isDictionary, PrimitiveType parquetType, TypeInfo hiveType, Dictionary dictionary, ValuesReader valuesReader, boolean skipTimestampConversion, ZoneId writerTimezone, boolean legacyConversionEnabled) throws IOException {
    // max length for varchar and char cases
    final int length = getVarcharLength(hiveType);

    TypeInfo effectiveHiveType = hiveType;
    if (hiveType instanceof ListTypeInfo) {
        effectiveHiveType = ((ListTypeInfo) hiveType).getListElementTypeInfo();
    }
    final String baseTypeName = TypeInfoUtils.getBaseName(effectiveHiveType.getTypeName());

    // Precision and scale are only meaningful for Hive decimals; everything else uses 0.
    int hivePrecision = 0;
    int hiveScale = 0;
    if (baseTypeName.equalsIgnoreCase(serdeConstants.DECIMAL_TYPE_NAME)) {
        DecimalTypeInfo decimalInfo = (DecimalTypeInfo) effectiveHiveType;
        hivePrecision = decimalInfo.getPrecision();
        hiveScale = decimalInfo.getScale();
    }

    switch (parquetType.getPrimitiveTypeName()) {
        case INT32: {
            // Unsigned check takes priority over the decimal annotation.
            if (ETypeConverter.isUnsignedInteger(parquetType)) {
                if (isDictionary) {
                    return new TypesFromUInt32PageReader(dictionary, length, hivePrecision, hiveScale);
                }
                return new TypesFromUInt32PageReader(valuesReader, length, hivePrecision, hiveScale);
            }
            LogicalTypeAnnotation int32Annotation = parquetType.getLogicalTypeAnnotation();
            if (int32Annotation instanceof DecimalLogicalTypeAnnotation) {
                final short scale = (short) ((DecimalLogicalTypeAnnotation) int32Annotation).getScale();
                if (isDictionary) {
                    return new TypesFromInt32DecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale);
                }
                return new TypesFromInt32DecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale);
            }
            if (isDictionary) {
                return new TypesFromInt32PageReader(dictionary, length, hivePrecision, hiveScale);
            }
            return new TypesFromInt32PageReader(valuesReader, length, hivePrecision, hiveScale);
        }
        case INT64: {
            // Order matters: timestamp annotation, then unsigned, then decimal, then plain.
            LogicalTypeAnnotation int64Annotation = parquetType.getLogicalTypeAnnotation();
            if (int64Annotation instanceof TimestampLogicalTypeAnnotation) {
                TimestampLogicalTypeAnnotation timestamp = (TimestampLogicalTypeAnnotation) int64Annotation;
                if (isDictionary) {
                    return new TypesFromInt64PageReader(dictionary, length, timestamp.isAdjustedToUTC(), timestamp.getUnit());
                }
                return new TypesFromInt64PageReader(valuesReader, length, timestamp.isAdjustedToUTC(), timestamp.getUnit());
            }
            if (ETypeConverter.isUnsignedInteger(parquetType)) {
                if (isDictionary) {
                    return new TypesFromUInt64PageReader(dictionary, length, hivePrecision, hiveScale);
                }
                return new TypesFromUInt64PageReader(valuesReader, length, hivePrecision, hiveScale);
            }
            if (int64Annotation instanceof DecimalLogicalTypeAnnotation) {
                final short scale = (short) ((DecimalLogicalTypeAnnotation) int64Annotation).getScale();
                if (isDictionary) {
                    return new TypesFromInt64DecimalPageReader(dictionary, length, scale, hivePrecision, hiveScale);
                }
                return new TypesFromInt64DecimalPageReader(valuesReader, length, scale, hivePrecision, hiveScale);
            }
            if (isDictionary) {
                return new TypesFromInt64PageReader(dictionary, length, hivePrecision, hiveScale);
            }
            return new TypesFromInt64PageReader(valuesReader, length, hivePrecision, hiveScale);
        }
        case FLOAT:
            if (isDictionary) {
                return new TypesFromFloatPageReader(dictionary, length, hivePrecision, hiveScale);
            }
            return new TypesFromFloatPageReader(valuesReader, length, hivePrecision, hiveScale);
        case INT96: {
            final ZoneId targetZone;
            if (skipTimestampConversion) {
                targetZone = ZoneOffset.UTC;
            } else {
                targetZone = firstNonNull(writerTimezone, TimeZone.getDefault().toZoneId());
            }
            if (isDictionary) {
                return new TypesFromInt96PageReader(dictionary, length, targetZone, legacyConversionEnabled);
            }
            return new TypesFromInt96PageReader(valuesReader, length, targetZone, legacyConversionEnabled);
        }
        case BOOLEAN:
            if (isDictionary) {
                return new TypesFromBooleanPageReader(dictionary, length);
            }
            return new TypesFromBooleanPageReader(valuesReader, length);
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
            return getConvertorFromBinary(isDictionary, parquetType, hiveType, valuesReader, dictionary);
        case DOUBLE:
            if (isDictionary) {
                return new TypesFromDoublePageReader(dictionary, length, hivePrecision, hiveScale);
            }
            return new TypesFromDoublePageReader(valuesReader, length, hivePrecision, hiveScale);
        default:
            if (isDictionary) {
                return new DefaultParquetDataColumnReader(dictionary, length, hivePrecision, hiveScale);
            }
            return new DefaultParquetDataColumnReader(valuesReader, length, hivePrecision, hiveScale);
    }
}
Also used : DecimalLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) ZoneId(java.time.ZoneId) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) StringLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation) DecimalLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) TimestampLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) TimestampLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation) TimeUnit(org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit)

Aggregations

LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation)6 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)4 CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo)2 DecimalTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo)2 ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo)2 MapTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo)2 PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo)2 VarcharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo)2 GroupType (org.apache.parquet.schema.GroupType)2 DecimalLogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation)2 StringLogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation)2 TimestampLogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation)2 MessageType (org.apache.parquet.schema.MessageType)2 Type (org.apache.parquet.schema.Type)2 ZoneId (java.time.ZoneId)1 Optional (java.util.Optional)1 ArrayType (org.apache.flink.table.types.logical.ArrayType)1 DecimalType (org.apache.flink.table.types.logical.DecimalType)1 LocalZonedTimestampType (org.apache.flink.table.types.logical.LocalZonedTimestampType)1 MapType (org.apache.flink.table.types.logical.MapType)1