Search in sources:

Example 26 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

From the class TestAvroSchemaConverter, method testTimestampMillisType.

/**
 * Checks that an Avro long with the timestamp-millis logical type round-trips
 * through the expected Parquet message, and that converting a Parquet
 * primitive of any other listed physical type annotated with TIMESTAMP_MILLIS
 * is rejected with an {@link IllegalArgumentException}.
 */
@Test
public void testTimestampMillisType() throws Exception {
    // Avro side: a record with a single long field tagged timestamp-millis.
    Schema timestampSchema = LogicalTypes.timestampMillis().addToSchema(Schema.create(LONG));
    Schema.Field timestampField = new Schema.Field("timestamp", timestampSchema, null, null);
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(timestampField));

    // Parquet side: the textual message the record is expected to map to.
    String parquetMessage = "message myrecord {\n" + "  required int64 timestamp (TIMESTAMP(MILLIS,true));\n" + "}\n";
    testRoundTripConversion(expected, parquetMessage);

    // Every physical type below is expected to be refused for this annotation.
    PrimitiveTypeName[] rejectedPrimitives = { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY };
    for (PrimitiveTypeName primitive : rejectedPrimitives) {
        // FIXED_LEN_BYTE_ARRAY needs an explicit length, hence the separate constructor.
        final PrimitiveType type = (primitive == FIXED_LEN_BYTE_ARRAY)
            ? new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MILLIS)
            : new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MILLIS);
        assertThrows(
            "Should not allow TIMESTAMP_MILLIS with " + primitive,
            IllegalArgumentException.class,
            () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Also used : AvroTestUtil.optionalField(org.apache.parquet.avro.AvroTestUtil.optionalField) Schema(org.apache.avro.Schema) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) Test(org.junit.Test)

Example 27 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

From the class TestAvroSchemaConverter, method testTimestampMicrosType.

/**
 * Checks that an Avro long with the timestamp-micros logical type round-trips
 * through the expected Parquet message, and that converting a Parquet
 * primitive of any other listed physical type annotated with TIMESTAMP_MICROS
 * is rejected with an {@link IllegalArgumentException}.
 */
@Test
public void testTimestampMicrosType() throws Exception {
    // Avro side: a record with a single long field tagged timestamp-micros.
    Schema timestampSchema = LogicalTypes.timestampMicros().addToSchema(Schema.create(LONG));
    Schema.Field timestampField = new Schema.Field("timestamp", timestampSchema, null, null);
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(timestampField));

    // Parquet side: the textual message the record is expected to map to.
    String parquetMessage = "message myrecord {\n" + "  required int64 timestamp (TIMESTAMP(MICROS,true));\n" + "}\n";
    testRoundTripConversion(expected, parquetMessage);

    // Every physical type below is expected to be refused for this annotation.
    PrimitiveTypeName[] rejectedPrimitives = { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY };
    for (PrimitiveTypeName primitive : rejectedPrimitives) {
        // FIXED_LEN_BYTE_ARRAY needs an explicit length, hence the separate constructor.
        final PrimitiveType type = (primitive == FIXED_LEN_BYTE_ARRAY)
            ? new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MICROS)
            : new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MICROS);
        assertThrows(
            "Should not allow TIMESTAMP_MICROS with " + primitive,
            IllegalArgumentException.class,
            () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Also used : AvroTestUtil.optionalField(org.apache.parquet.avro.AvroTestUtil.optionalField) Schema(org.apache.avro.Schema) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) Test(org.junit.Test)

Example 28 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

From the class AvroSchemaConverter, method convertField (Avro schema to Parquet type).

/**
 * Converts one Avro schema node into the Parquet type named {@code fieldName}.
 *
 * <p>Records, arrays, maps and unions are converted to nested types and
 * returned directly. Every other supported Avro type is mapped to a Parquet
 * primitive builder, which is then annotated with the Parquet equivalent of
 * the Avro logical type (if there is one) before being named and built.
 *
 * @param fieldName  name given to the resulting Parquet type
 * @param schema     Avro schema to convert
 * @param repetition repetition applied to the resulting Parquet type
 * @param schemaPath path of this field, checked against {@code pathsToInt96}
 *                   for FIXED fields and forwarded to nested conversions
 * @return the Parquet type for {@code schema}
 * @throws IllegalArgumentException      if a FIXED field selected for INT96
 *                                       conversion is not exactly 12 bytes
 * @throws UnsupportedOperationException if the Avro type has no branch here
 */
@SuppressWarnings("deprecation")
private Type convertField(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) {
    Types.PrimitiveBuilder<PrimitiveType> builder;
    Schema.Type type = schema.getType();
    LogicalType logicalType = schema.getLogicalType();
    if (type.equals(Schema.Type.BOOLEAN)) {
        builder = Types.primitive(BOOLEAN, repetition);
    } else if (type.equals(Schema.Type.INT)) {
        builder = Types.primitive(INT32, repetition);
    } else if (type.equals(Schema.Type.LONG)) {
        builder = Types.primitive(INT64, repetition);
    } else if (type.equals(Schema.Type.FLOAT)) {
        builder = Types.primitive(FLOAT, repetition);
    } else if (type.equals(Schema.Type.DOUBLE)) {
        builder = Types.primitive(DOUBLE, repetition);
    } else if (type.equals(Schema.Type.BYTES)) {
        builder = Types.primitive(BINARY, repetition);
    } else if (type.equals(Schema.Type.STRING)) {
        // A uuid-tagged string is stored as a fixed-length binary only when
        // writeParquetUUID is enabled; otherwise it stays a plain string.
        if (logicalType != null && logicalType.getName().equals(LogicalTypes.uuid().getName()) && writeParquetUUID) {
            builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES);
        } else {
            builder = Types.primitive(BINARY, repetition).as(stringType());
        }
    } else if (type.equals(Schema.Type.RECORD)) {
        return new GroupType(repetition, fieldName, convertFields(schema.getFields(), schemaPath));
    } else if (type.equals(Schema.Type.ENUM)) {
        builder = Types.primitive(BINARY, repetition).as(enumType());
    } else if (type.equals(Schema.Type.ARRAY)) {
        // Two list layouts are supported, selected by writeOldListStructure.
        if (writeOldListStructure) {
            return ConversionPatterns.listType(repetition, fieldName, convertField("array", schema.getElementType(), REPEATED, schemaPath));
        } else {
            return ConversionPatterns.listOfElements(repetition, fieldName, convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType(), schemaPath));
        }
    } else if (type.equals(Schema.Type.MAP)) {
        Type valType = convertField("value", schema.getValueType(), schemaPath);
        // avro map key type is always string
        return ConversionPatterns.stringKeyMapType(repetition, fieldName, valType);
    } else if (type.equals(Schema.Type.FIXED)) {
        if (pathsToInt96.contains(schemaPath)) {
            // Only 12-byte fixed fields can be written as INT96.
            if (schema.getFixedSize() != 12) {
                throw new IllegalArgumentException("The size of the fixed type field " + schemaPath + " must be 12 bytes for INT96 conversion");
            }
            builder = Types.primitive(PrimitiveTypeName.INT96, repetition);
        } else {
            builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize());
        }
    } else if (type.equals(Schema.Type.UNION)) {
        return convertUnion(fieldName, schema, repetition, schemaPath);
    } else {
        throw new UnsupportedOperationException("Cannot convert Avro type " + type);
    }
    // Annotate the primitive with the Parquet equivalent of the Avro logical
    // type. Both branches reassign the builder so that neither depends on
    // as(...) mutating the receiver in place.
    if (logicalType != null) {
        if (logicalType instanceof LogicalTypes.Decimal) {
            LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
            builder = builder.as(decimalType(decimal.getScale(), decimal.getPrecision()));
        } else {
            LogicalTypeAnnotation annotation = convertLogicalType(logicalType);
            if (annotation != null) {
                builder = builder.as(annotation);
            }
        }
    }
    return builder.named(fieldName);
}
Also used : LogicalTypes(org.apache.avro.LogicalTypes) Types(org.apache.parquet.schema.Types) Schema(org.apache.avro.Schema) LogicalType(org.apache.avro.LogicalType) LogicalTypes(org.apache.avro.LogicalTypes) PrimitiveType(org.apache.parquet.schema.PrimitiveType) LogicalTypeAnnotation.timeType(org.apache.parquet.schema.LogicalTypeAnnotation.timeType) LogicalTypeAnnotation.timestampType(org.apache.parquet.schema.LogicalTypeAnnotation.timestampType) LogicalTypeAnnotation.decimalType(org.apache.parquet.schema.LogicalTypeAnnotation.decimalType) LogicalTypeAnnotation.dateType(org.apache.parquet.schema.LogicalTypeAnnotation.dateType) LogicalTypeAnnotation.enumType(org.apache.parquet.schema.LogicalTypeAnnotation.enumType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) LogicalTypeAnnotation.uuidType(org.apache.parquet.schema.LogicalTypeAnnotation.uuidType) MessageType(org.apache.parquet.schema.MessageType) LogicalTypeAnnotation.stringType(org.apache.parquet.schema.LogicalTypeAnnotation.stringType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) UUIDLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 29 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

From the class AvroSchemaConverter, method convertField (Parquet type to Avro schema).

/**
 * Converts one Parquet type into the corresponding Avro schema.
 *
 * <p>Primitive types are mapped by physical type: BINARY becomes an Avro
 * string when annotated UTF8 or ENUM and bytes otherwise, FIXED_LEN_BYTE_ARRAY
 * becomes an Avro fixed of the same length, and INT96 is rejected. The Avro
 * logical type derived from the original type is then attached, except that
 * DECIMAL is only attached to BINARY and FIXED_LEN_BYTE_ARRAY.
 *
 * <p>Group types are mapped by original type: LIST to an Avro array, MAP and
 * MAP_KEY_VALUE to an Avro map, ENUM to an Avro string, and a group with no
 * original type to a record. Any other original type is unsupported.
 *
 * @param parquetType the Parquet type to convert
 * @return the Avro schema equivalent to {@code parquetType}
 * @throws IllegalArgumentException      for INT96 primitives, or for a map
 *                                       whose key is not a UTF8 binary
 * @throws UnsupportedOperationException for malformed list/map groups or
 *                                       group annotations with no mapping
 */
private Schema convertField(final Type parquetType) {
    if (parquetType.isPrimitive()) {
        final PrimitiveType asPrimitive = parquetType.asPrimitiveType();
        final PrimitiveTypeName parquetPrimitiveTypeName = asPrimitive.getPrimitiveTypeName();
        final OriginalType annotation = parquetType.getOriginalType();
        // Map the physical type first; the logical type is layered on afterwards.
        Schema schema = parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter<Schema, RuntimeException>() {

            @Override
            public Schema convertBOOLEAN(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.BOOLEAN);
            }

            @Override
            public Schema convertINT32(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.INT);
            }

            @Override
            public Schema convertINT64(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.LONG);
            }

            @Override
            public Schema convertINT96(PrimitiveTypeName primitiveTypeName) {
                throw new IllegalArgumentException("INT96 not yet implemented.");
            }

            @Override
            public Schema convertFLOAT(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.FLOAT);
            }

            @Override
            public Schema convertDOUBLE(PrimitiveTypeName primitiveTypeName) {
                return Schema.create(Schema.Type.DOUBLE);
            }

            @Override
            public Schema convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) {
                // The Avro fixed takes its name and size from the Parquet column.
                int size = asPrimitive.getTypeLength();
                return Schema.createFixed(parquetType.getName(), null, null, size);
            }

            @Override
            public Schema convertBINARY(PrimitiveTypeName primitiveTypeName) {
                if (annotation == OriginalType.UTF8 || annotation == OriginalType.ENUM) {
                    return Schema.create(Schema.Type.STRING);
                } else {
                    return Schema.create(Schema.Type.BYTES);
                }
            }
        });
        LogicalType logicalType = convertOriginalType(annotation, asPrimitive.getDecimalMetadata());
        // DECIMAL is only carried over for BINARY and FIXED_LEN_BYTE_ARRAY;
        // every other logical type is attached unconditionally.
        if (logicalType != null && (annotation != DECIMAL || parquetPrimitiveTypeName == BINARY || parquetPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY)) {
            schema = logicalType.addToSchema(schema);
        }
        return schema;
    } else {
        GroupType parquetGroupType = parquetType.asGroupType();
        OriginalType originalType = parquetGroupType.getOriginalType();
        if (originalType != null) {
            switch(originalType) {
                case LIST:
                    // A list group must wrap exactly one repeated field.
                    if (parquetGroupType.getFieldCount() != 1) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    Type repeatedType = parquetGroupType.getType(0);
                    if (!repeatedType.isRepetition(REPEATED)) {
                        throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
                    }
                    if (isElementType(repeatedType, parquetGroupType.getName())) {
                        // repeated element types are always required
                        return Schema.createArray(convertField(repeatedType));
                    } else {
                        // Otherwise the repeated group wraps the element as its first field.
                        Type elementType = repeatedType.asGroupType().getType(0);
                        if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
                            return Schema.createArray(optional(convertField(elementType)));
                        } else {
                            return Schema.createArray(convertField(elementType));
                        }
                    }
                // for backward-compatibility
                case MAP_KEY_VALUE:
                case MAP:
                    // A map group must wrap exactly one repeated group of (key, value).
                    if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType();
                    if (!mapKeyValType.isRepetition(REPEATED) || mapKeyValType.getFieldCount() != 2) {
                        throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
                    }
                    Type keyType = mapKeyValType.getType(0);
                    // Identity comparison keeps this check null-safe: a key without
                    // any original type is rejected with the IllegalArgumentException
                    // below instead of failing with a NullPointerException.
                    if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveTypeName.BINARY) || keyType.getOriginalType() != OriginalType.UTF8) {
                        throw new IllegalArgumentException("Map key type must be binary (UTF8): " + keyType);
                    }
                    Type valueType = mapKeyValType.getType(1);
                    if (valueType.isRepetition(Type.Repetition.OPTIONAL)) {
                        return Schema.createMap(optional(convertField(valueType)));
                    } else {
                        return Schema.createMap(convertField(valueType));
                    }
                case ENUM:
                    return Schema.create(Schema.Type.STRING);
                case UTF8:
                default:
                    throw new UnsupportedOperationException("Cannot convert Parquet type " + parquetType);
            }
        } else {
            // if no original type then it's a record
            return convertFields(parquetGroupType.getName(), parquetGroupType.getFields());
        }
    }
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) Schema(org.apache.avro.Schema) LogicalType(org.apache.avro.LogicalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Example 30 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

From the class TestJsonRecordFormatter, method testFlatSchemaWithArrays.

/**
 * Formats a flat record that has one required, one optional and two repeated
 * primitive columns, and checks that the repeated columns are rendered as
 * JSON arrays in the formatter output.
 */
@Test
public void testFlatSchemaWithArrays() throws Exception {
    SimpleRecord simple = new SimpleRecord();

    // Schema columns, declared individually for readability.
    PrimitiveType reqdColumn = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "reqd");
    PrimitiveType optColumn = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, "opt");
    PrimitiveType oddColumn = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT32, "odd");
    PrimitiveType evenColumn = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT64, "even");
    MessageType schema = new MessageType("schema", reqdColumn, optColumn, oddColumn, evenColumn);

    simple.values.add(kv("reqd", "a required value"));
    simple.values.add(kv("opt", 1.2345));
    // Repeated columns: 1, 3, 5, 7, 9 followed by 2, 4, 6, 8, 10.
    for (int odd = 1; odd <= 9; odd += 2) {
        simple.values.add(kv("odd", odd));
    }
    for (int even = 2; even <= 10; even += 2) {
        simple.values.add(kv("even", even));
    }

    String expected = asJsonString(
        obj(
            entry("reqd", "a required value"),
            entry("opt", 1.2345),
            entry("odd", array(1, 3, 5, 7, 9)),
            entry("even", array(2, 4, 6, 8, 10))));
    String actual = JsonRecordFormatter.fromSchema(schema).formatRecord(simple);
    assertEquals(expected, actual);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)123 Test (org.junit.Test)66 MessageType (org.apache.parquet.schema.MessageType)41 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 GroupType (org.apache.parquet.schema.GroupType)25 Test (org.testng.annotations.Test)20 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)14 OriginalType (org.apache.parquet.schema.OriginalType)14 Type (org.apache.parquet.schema.Type)12 List (java.util.List)11 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)11 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)11 ArrayList (java.util.ArrayList)10