Search in sources :

Example 16 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class AvroSchemaConverter method convertField.

/**
 * Converts one Avro schema into the Parquet type used for the field {@code fieldName}.
 *
 * <p>Primitive-like Avro types (including STRING, ENUM and FIXED) are mapped to a
 * primitive builder and then optionally annotated from the Avro logical type.
 * RECORD, ARRAY, MAP and UNION are delegated to helpers and returned directly.
 *
 * @param fieldName  name given to the resulting Parquet field
 * @param schema     Avro schema to convert
 * @param repetition repetition applied to the resulting Parquet field
 * @return the Parquet type built for {@code schema}
 * @throws UnsupportedOperationException if the Avro type has no mapping here
 */
@SuppressWarnings("deprecation")
private Type convertField(String fieldName, Schema schema, Type.Repetition repetition) {
    Types.PrimitiveBuilder<PrimitiveType> primitive;
    final Schema.Type avroType = schema.getType();
    // Case labels name Avro Schema.Type constants; identifiers inside the case
    // bodies (BOOLEAN, ENUM, ...) resolve to the Parquet constants in scope.
    switch (avroType) {
        case BOOLEAN:
            primitive = Types.primitive(BOOLEAN, repetition);
            break;
        case INT:
            primitive = Types.primitive(INT32, repetition);
            break;
        case LONG:
            primitive = Types.primitive(INT64, repetition);
            break;
        case FLOAT:
            primitive = Types.primitive(FLOAT, repetition);
            break;
        case DOUBLE:
            primitive = Types.primitive(DOUBLE, repetition);
            break;
        case BYTES:
            primitive = Types.primitive(BINARY, repetition);
            break;
        case STRING:
            primitive = Types.primitive(BINARY, repetition).as(UTF8);
            break;
        case ENUM:
            primitive = Types.primitive(BINARY, repetition).as(ENUM);
            break;
        case FIXED:
            primitive = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize());
            break;
        case RECORD:
            return new GroupType(repetition, fieldName, convertFields(schema.getFields()));
        case ARRAY:
            // Two list layouts are supported; the flag selects the older one.
            return writeOldListStructure
                    ? ConversionPatterns.listType(repetition, fieldName, convertField("array", schema.getElementType(), REPEATED))
                    : ConversionPatterns.listOfElements(repetition, fieldName, convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType()));
        case MAP:
            // avro map key type is always string
            return ConversionPatterns.stringKeyMapType(repetition, fieldName, convertField("value", schema.getValueType()));
        case UNION:
            return convertUnion(fieldName, schema, repetition);
        default:
            throw new UnsupportedOperationException("Cannot convert Avro type " + avroType);
    }

    // Only known logical types are translated, because the translation
    // declares the Avro and Parquet annotations equivalent.
    final LogicalType logical = schema.getLogicalType();
    if (logical instanceof LogicalTypes.Decimal) {
        final LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logical;
        primitive = primitive.as(DECIMAL).precision(decimal.getPrecision()).scale(decimal.getScale());
    } else if (logical != null) {
        final OriginalType annotation = convertLogicalType(logical);
        if (annotation != null) {
            primitive.as(annotation);
        }
    }
    return primitive.named(fieldName);
}
Also used : Types(org.apache.parquet.schema.Types) LogicalTypes(org.apache.avro.LogicalTypes) OriginalType(org.apache.parquet.schema.OriginalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) Schema(org.apache.avro.Schema) LogicalType(org.apache.avro.LogicalType) LogicalTypes(org.apache.avro.LogicalTypes) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 17 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class AvroSchemaConverter method convertField.

/**
 * Converts a Parquet type into the corresponding Avro schema.
 *
 * <p>Primitive types are mapped through a {@code PrimitiveTypeNameConverter}
 * and then optionally decorated with an Avro logical type. Group types are
 * interpreted from their original-type annotation: LIST becomes an Avro array,
 * MAP / MAP_KEY_VALUE becomes an Avro map, and a group without annotation is
 * converted as a record via {@code convertFields}.
 *
 * @param parquetType the Parquet type to convert
 * @return the Avro schema for {@code parquetType}
 * @throws IllegalArgumentException      for INT96 primitives, or when a map key is
 *                                       not a BINARY annotated as UTF8
 * @throws UnsupportedOperationException for malformed list/map groups or group
 *                                       annotations that cannot be converted
 */
private Schema convertField(final Type parquetType) {
    if (parquetType.isPrimitive()) {
        final PrimitiveType asPrimitive = parquetType.asPrimitiveType();
        final PrimitiveTypeName parquetPrimitiveTypeName = asPrimitive.getPrimitiveTypeName();
        // May be null when the primitive carries no annotation.
        final OriginalType annotation = parquetType.getOriginalType();
        Schema schema = parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter<Schema, RuntimeException>() {

            @Override
            public Schema convertBOOLEAN(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.BOOLEAN);
            }

            @Override
            public Schema convertINT32(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.INT);
            }

            @Override
            public Schema convertINT64(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.LONG);
            }

            @Override
            public Schema convertINT96(PrimitiveTypeName primitiveTypeName) {
                throw new IllegalArgumentException("INT96 not yet implemented.");
            }

            @Override
            public Schema convertFLOAT(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.FLOAT);
            }

            @Override
            public Schema convertDOUBLE(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.DOUBLE);
            }

            @Override
            public Schema convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) {
                // The Avro fixed schema reuses the Parquet field name and byte length.
                int size = parquetType.asPrimitiveType().getTypeLength();
                return Schema.createFixed(parquetType.getName(), null, null, size);
            }

            @Override
            public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) {
                // UTF8 and ENUM binaries both surface as Avro strings; anything else is raw bytes.
                if (annotation == OriginalType.UTF8 || annotation == OriginalType.ENUM) {
                    return Schema.create(Schema.Type.STRING);
                } else {
                    return Schema.create(Schema.Type.BYTES);
                }
            }
        });
        LogicalType logicalType = convertOriginalType(annotation, asPrimitive.getDecimalMetadata());
        // A DECIMAL annotation is only carried over when the physical type is
        // BINARY or FIXED_LEN_BYTE_ARRAY; other logical types are always applied.
        if (logicalType != null && (annotation != DECIMAL || parquetPrimitiveTypeName == BINARY || parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) {
            schema = logicalType.addToSchema(schema);
        }
        return schema;
    } else {
        GroupType parquetGroupType = parquetType.asGroupType();
        OriginalType originalType = parquetGroupType.getOriginalType();
        if (originalType != null) {
            switch(originalType) {
                case LIST:
                    // A list group must wrap exactly one repeated field.
                    if (parquetGroupType.getFieldCount() != 1) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    Type repeatedType = parquetGroupType.getType(0);
                    if (!repeatedType.isRepetition(REPEATED)) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    if (isElementType(repeatedType, parquetGroupType.getName())) {
                        // repeated element types are always required
                        return Schema.createArray(convertField(repeatedType));
                    } else {
                        // Otherwise the repeated group is a wrapper and its first field is the element.
                        Type elementType = repeatedType.asGroupType().getType(0);
                        if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
                            return Schema.createArray(optional(convertField(elementType)));
                        } else {
                            return Schema.createArray(convertField(elementType));
                        }
                    }
                // for backward-compatibility
                case MAP_KEY_VALUE:
                case MAP:
                    // A map group must wrap exactly one repeated group holding (key, value).
                    if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType();
                    if (!mapKeyValType.isRepetition(REPEATED) || mapKeyValType.getFieldCount() != 2) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    Type keyType = mapKeyValType.getType(0);
                    // Identity comparison is null-safe: an un-annotated key (null original
                    // type) is rejected with the descriptive exception below instead of
                    // failing with a NullPointerException.
                    if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveTypeName.BINARY) || keyType.getOriginalType() != OriginalType.UTF8) {
                        throw new IllegalArgumentException("Map key type must be binary (UTF8): " + keyType);
                    }
                    Type valueType = mapKeyValType.getType(1);
                    if (valueType.isRepetition(Type.Repetition.OPTIONAL)) {
                        return Schema.createMap(optional(convertField(valueType)));
                    } else {
                        return Schema.createMap(convertField(valueType));
                    }
                case ENUM:
                    return Schema.create(Schema.Type.STRING);
                case UTF8:
                default:
                    throw new UnsupportedOperationException("Cannot convert Parquet type " + parquetType);
            }
        } else {
            // if no original type then it's a record
            return convertFields(parquetGroupType.getName(), parquetGroupType.getFields());
        }
    }
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) Schema(org.apache.avro.Schema) LogicalType(org.apache.avro.LogicalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Example 18 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class ColumnReadStoreImpl method getPrimitiveConverter.

/**
 * Finds the primitive converter for a column by walking the column's path
 * through the schema and the converter tree in lock step.
 *
 * @param path descriptor whose path segments name the fields to descend through
 * @return the converter reached at the end of the path, viewed as a primitive converter
 */
private PrimitiveConverter getPrimitiveConverter(ColumnDescriptor path) {
    Type cursorType = schema;
    Converter cursorConverter = recordConverter;
    for (String segment : path.getPath()) {
        final GroupType parent = cursorType.asGroupType();
        // The field's position in the group selects the matching child converter.
        final int position = parent.getFieldIndex(segment);
        cursorType = parent.getType(segment);
        cursorConverter = cursorConverter.asGroupConverter().getConverter(position);
    }
    return cursorConverter.asPrimitiveConverter();
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) Converter(org.apache.parquet.io.api.Converter) GroupConverter(org.apache.parquet.io.api.GroupConverter) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter)

Example 19 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class DataWritableWriter method writeArray.

/**
 * Writes the contents of {@code array} to the record consumer, one Parquet
 * field of {@code type} at a time. A {@code null} array writes nothing.
 *
 * <p>For every field of {@code type}, each non-null element of the array is
 * emitted between {@code startField} and {@code endField}: as a primitive when
 * the field is primitive, otherwise as a nested group.
 *
 * @param array values to write; may be {@code null}
 * @param type  group type describing the fields to emit
 * @throws RuntimeException if a group-typed field receives a value that is not an {@code ArrayWritable}
 */
private void writeArray(final ArrayWritable array, final GroupType type) {
    if (array == null) {
        return;
    }
    final Writable[] elements = array.get();
    final int numFields = type.getFieldCount();
    for (int index = 0; index < numFields; index++) {
        final Type fieldType = type.getType(index);
        recordConsumer.startField(fieldType.getName(), index);
        for (final Writable element : elements) {
            if (element == null) {
                // Null entries are skipped entirely.
                continue;
            }
            if (fieldType.isPrimitive()) {
                // A wrapped element contributes the entry at this field's index
                // (the original author questioned whether this should be 0).
                final Writable value = (element instanceof ArrayWritable)
                        ? ((ArrayWritable) element).get()[index]
                        : element;
                writePrimitive(value);
            } else if (element instanceof ArrayWritable) {
                recordConsumer.startGroup();
                writeData((ArrayWritable) element, fieldType.asGroupType());
                recordConsumer.endGroup();
            } else {
                throw new RuntimeException("This should be a ArrayWritable: " + element);
            }
        }
        recordConsumer.endField(fieldType.getName(), index);
    }
}
Also used : GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type) ArrayWritable(org.apache.hadoop.io.ArrayWritable) ByteWritable(org.apache.hadoop.hive.serde2.io.ByteWritable) BigDecimalWritable(org.apache.hadoop.hive.ql.io.parquet.writable.BigDecimalWritable) BinaryWritable(org.apache.hadoop.hive.ql.io.parquet.writable.BinaryWritable) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) DoubleWritable(org.apache.hadoop.hive.serde2.io.DoubleWritable) ShortWritable(org.apache.hadoop.hive.serde2.io.ShortWritable) ArrayWritable(org.apache.hadoop.io.ArrayWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) IntWritable(org.apache.hadoop.io.IntWritable)

Example 20 with Type

use of org.apache.parquet.schema.Type in project parquet-mr by apache.

the class ProtoSchemaConverter method addField.

/**
 * Starts the Parquet field builder that corresponds to a protobuf field.
 *
 * <p>The repetition comes from {@code getRepetition(descriptor)} and the
 * Parquet shape is chosen from the field's protobuf {@code JavaType}.
 *
 * @param descriptor protobuf field being converted
 * @param builder    group builder the new field is started on
 * @param <T>        parent type of {@code builder}
 * @return the primitive or group builder started for the field
 * @throws UnsupportedOperationException if the field's Java type has no mapping here
 */
private <T> Builder<? extends Builder<?, GroupBuilder<T>>, GroupBuilder<T>> addField(Descriptors.FieldDescriptor descriptor, GroupBuilder<T> builder) {
    final Type.Repetition fieldRepetition = getRepetition(descriptor);
    final JavaType protoType = descriptor.getJavaType();
    switch (protoType) {
        // Nested message: open a group and convert its fields into it.
        case MESSAGE:
            GroupBuilder<GroupBuilder<T>> nested = builder.group(fieldRepetition);
            convertFields(nested, descriptor.getMessageType().getFields());
            return nested;

        // Binary-backed types that carry an annotation.
        case STRING:
            return builder.primitive(BINARY, fieldRepetition).as(UTF8);
        case ENUM:
            return builder.primitive(BINARY, fieldRepetition).as(ENUM);

        // Plain primitives.
        case BYTE_STRING:
            return builder.primitive(BINARY, fieldRepetition);
        case BOOLEAN:
            return builder.primitive(BOOLEAN, fieldRepetition);
        case INT:
            return builder.primitive(INT32, fieldRepetition);
        case LONG:
            return builder.primitive(INT64, fieldRepetition);
        case FLOAT:
            return builder.primitive(FLOAT, fieldRepetition);
        case DOUBLE:
            return builder.primitive(DOUBLE, fieldRepetition);

        default:
            throw new UnsupportedOperationException("Cannot convert Protocol Buffer: unknown type " + protoType);
    }
}
Also used : JavaType(com.google.protobuf.Descriptors.FieldDescriptor.JavaType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) JavaType(com.google.protobuf.Descriptors.FieldDescriptor.JavaType) FLOAT(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT) GroupBuilder(org.apache.parquet.schema.Types.GroupBuilder)

Aggregations

Type (org.apache.parquet.schema.Type)88 MessageType (org.apache.parquet.schema.MessageType)72 GroupType (org.apache.parquet.schema.GroupType)69 OriginalType (org.apache.parquet.schema.OriginalType)35 PrimitiveType (org.apache.parquet.schema.PrimitiveType)35 ArrayList (java.util.ArrayList)25 HashMap (java.util.HashMap)10 SchemaPath (org.apache.drill.common.expression.SchemaPath)10 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)10 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)10 PathSegment (org.apache.drill.common.expression.PathSegment)8 Converter (org.apache.parquet.io.api.Converter)6 GroupConverter (org.apache.parquet.io.api.GroupConverter)6 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)5 MaterializedField (org.apache.drill.exec.record.MaterializedField)5 LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation)5 Collection (java.util.Collection)4 List (java.util.List)4 Function (java.util.function.Function)4 LogicalType (org.apache.avro.LogicalType)4