Search in sources:

Example 61 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

The method convertField of the class AvroSchemaConverter.

/**
 * Converts one Avro schema node into the corresponding Parquet {@link Type}.
 *
 * <p>Primitive-like Avro types (boolean, int, long, float, double, bytes, string, enum,
 * fixed) are mapped onto a primitive builder, optionally annotated with a logical type,
 * and finally named {@code fieldName}. Records, arrays, maps and unions return directly
 * from their own branch by delegating to the sibling conversion helpers.
 *
 * @param fieldName  name given to the resulting Parquet field
 * @param schema     Avro schema to convert
 * @param repetition repetition applied to the resulting Parquet field
 * @param schemaPath path of this field, forwarded to nested conversions and looked up in
 *                   {@code pathsToInt96} for fixed types
 * @return the Parquet type equivalent to {@code schema}
 * @throws IllegalArgumentException      if a fixed field selected for INT96 conversion is
 *                                       not 12 bytes long
 * @throws UnsupportedOperationException if the Avro type has no branch here
 */
@SuppressWarnings("deprecation")
private Type convertField(String fieldName, Schema schema, Type.Repetition repetition, String schemaPath) {
    final Schema.Type avroType = schema.getType();
    final LogicalType avroLogicalType = schema.getLogicalType();
    Types.PrimitiveBuilder<PrimitiveType> primitive;

    // Case labels below name Schema.Type constants; the bare identifiers passed to
    // Types.primitive(...) are the Parquet primitive type names, as in the if/else form.
    switch (avroType) {
        case BOOLEAN:
            primitive = Types.primitive(BOOLEAN, repetition);
            break;
        case INT:
            primitive = Types.primitive(INT32, repetition);
            break;
        case LONG:
            primitive = Types.primitive(INT64, repetition);
            break;
        case FLOAT:
            primitive = Types.primitive(FLOAT, repetition);
            break;
        case DOUBLE:
            primitive = Types.primitive(DOUBLE, repetition);
            break;
        case BYTES:
            primitive = Types.primitive(BINARY, repetition);
            break;
        case STRING: {
            // A string carrying the Avro "uuid" logical type is written as a fixed-length
            // binary only when writeParquetUUID is enabled; otherwise it stays a string.
            final boolean asParquetUuid = avroLogicalType != null
                    && avroLogicalType.getName().equals(LogicalTypes.uuid().getName())
                    && writeParquetUUID;
            if (asParquetUuid) {
                primitive = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition)
                        .length(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation.BYTES);
            } else {
                primitive = Types.primitive(BINARY, repetition).as(stringType());
            }
            break;
        }
        case RECORD:
            return new GroupType(repetition, fieldName, convertFields(schema.getFields(), schemaPath));
        case ENUM:
            primitive = Types.primitive(BINARY, repetition).as(enumType());
            break;
        case ARRAY:
            if (writeOldListStructure) {
                return ConversionPatterns.listType(repetition, fieldName,
                        convertField("array", schema.getElementType(), REPEATED, schemaPath));
            }
            return ConversionPatterns.listOfElements(repetition, fieldName,
                    convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType(), schemaPath));
        case MAP: {
            final Type mapValue = convertField("value", schema.getValueType(), schemaPath);
            // avro map key type is always string
            return ConversionPatterns.stringKeyMapType(repetition, fieldName, mapValue);
        }
        case FIXED:
            if (!pathsToInt96.contains(schemaPath)) {
                primitive = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(schema.getFixedSize());
            } else {
                if (schema.getFixedSize() != 12) {
                    throw new IllegalArgumentException("The size of the fixed type field " + schemaPath + " must be 12 bytes for INT96 conversion");
                }
                primitive = Types.primitive(PrimitiveTypeName.INT96, repetition);
            }
            break;
        case UNION:
            return convertUnion(fieldName, schema, repetition, schemaPath);
        default:
            throw new UnsupportedOperationException("Cannot convert Avro type " + avroType);
    }

    // Only primitive branches reach this point. Attach the logical-type annotation, if any.
    if (avroLogicalType != null) {
        if (avroLogicalType instanceof LogicalTypes.Decimal) {
            final LogicalTypes.Decimal avroDecimal = (LogicalTypes.Decimal) avroLogicalType;
            primitive = primitive.as(decimalType(avroDecimal.getScale(), avroDecimal.getPrecision()));
        } else {
            final LogicalTypeAnnotation converted = convertLogicalType(avroLogicalType);
            if (converted != null) {
                // Return value intentionally unused, exactly as in the original code.
                // NOTE(review): this relies on as(...) updating the builder in place - confirm.
                primitive.as(converted);
            }
        }
    }
    return primitive.named(fieldName);
}
Also used : LogicalTypes(org.apache.avro.LogicalTypes) Types(org.apache.parquet.schema.Types) Schema(org.apache.avro.Schema) LogicalType(org.apache.avro.LogicalType) LogicalTypes(org.apache.avro.LogicalTypes) PrimitiveType(org.apache.parquet.schema.PrimitiveType) LogicalTypeAnnotation.timeType(org.apache.parquet.schema.LogicalTypeAnnotation.timeType) LogicalTypeAnnotation.timestampType(org.apache.parquet.schema.LogicalTypeAnnotation.timestampType) LogicalTypeAnnotation.decimalType(org.apache.parquet.schema.LogicalTypeAnnotation.decimalType) LogicalTypeAnnotation.dateType(org.apache.parquet.schema.LogicalTypeAnnotation.dateType) LogicalTypeAnnotation.enumType(org.apache.parquet.schema.LogicalTypeAnnotation.enumType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) LogicalTypeAnnotation.uuidType(org.apache.parquet.schema.LogicalTypeAnnotation.uuidType) MessageType(org.apache.parquet.schema.MessageType) LogicalTypeAnnotation.stringType(org.apache.parquet.schema.LogicalTypeAnnotation.stringType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) LogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation) UUIDLogicalTypeAnnotation(org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 62 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class TestJsonRecordFormatter method testNestedGrouping.

/**
 * Formats a record that has a repeated top-level string and an optional nested group
 * (one int plus a repeated string) and checks the JSON produced by the formatter.
 */
@Test
public void testNestedGrouping() throws Exception {
    // Schema: repeated "flat-string" plus optional "subgroup" { "flat-int", repeated "string-list" }.
    PrimitiveType flatStringType = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "flat-string");
    PrimitiveType flatIntType = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, "flat-int");
    PrimitiveType stringListType = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.BINARY, "string-list");
    GroupType subgroupType = new GroupType(Type.Repetition.OPTIONAL, "subgroup", flatIntType, stringListType);
    MessageType schema = new MessageType("schema", flatStringType, subgroupType);

    // Nested record: the int first, then the repeated strings in order.
    SimpleRecord nested = new SimpleRecord();
    nested.values.add(kv("flat-int", 12345));
    for (String item : new String[] { "two", "four", "six", "eight", "ten" }) {
        nested.values.add(kv("string-list", item));
    }

    // Top-level record: the repeated strings in order, then the nested record.
    SimpleRecord simple = new SimpleRecord();
    for (String item : new String[] { "one", "two", "three", "four", "five" }) {
        simple.values.add(kv("flat-string", item));
    }
    simple.values.add(kv("subgroup", nested));

    String expected = asJsonString(
            obj(
                    entry("flat-string", array("one", "two", "three", "four", "five")),
                    entry("subgroup",
                            obj(
                                    entry("flat-int", 12345),
                                    entry("string-list", array("two", "four", "six", "eight", "ten"))))));
    String actual = JsonRecordFormatter.fromSchema(schema).formatRecord(simple);
    assertEquals(expected, actual);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 63 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by apache.

the class TestJsonRecordFormatter method testGroupList.

/**
 * Formats a record containing three occurrences of a repeated group (one long plus a
 * repeated double each) and checks the JSON produced by the formatter.
 */
@Test
public void testGroupList() throws Exception {
    // Schema: repeated "repeat-group" { "flat-int", repeated "repeat-double" }.
    PrimitiveType flatIntType = new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT64, "flat-int");
    PrimitiveType repeatDoubleType = new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.DOUBLE, "repeat-double");
    GroupType repeatGroupType = new GroupType(Type.Repetition.REPEATED, "repeat-group", flatIntType, repeatDoubleType);
    MessageType schema = new MessageType("schema", repeatGroupType);

    // One row per group occurrence: flatInts[i] goes with repeatDoubles[i].
    int[] flatInts = { 76543, 12345, 10293 };
    double[][] repeatDoubles = {
        { 1.2345, 5.6789, 10.11121314, 0.4321, 7.6543 },
        { 1.1, 1.2, 1.3, 1.4, 1.5 },
        { 9.5, 9.4, 9.3, 9.2, 9.1 }
    };

    SimpleRecord simple = new SimpleRecord();
    for (int i = 0; i < flatInts.length; i++) {
        SimpleRecord group = new SimpleRecord();
        group.values.add(kv("flat-int", flatInts[i]));
        for (double value : repeatDoubles[i]) {
            group.values.add(kv("repeat-double", value));
        }
        simple.values.add(kv("repeat-group", group));
    }

    String expected = asJsonString(
            obj(
                    entry("repeat-group",
                            array(
                                    obj(
                                            entry("flat-int", 76543),
                                            entry("repeat-double", array(1.2345, 5.6789, 10.11121314, 0.4321, 7.6543))),
                                    obj(
                                            entry("flat-int", 12345),
                                            entry("repeat-double", array(1.1, 1.2, 1.3, 1.4, 1.5))),
                                    obj(
                                            entry("flat-int", 10293),
                                            entry("repeat-double", array(9.5, 9.4, 9.3, 9.2, 9.1)))))));
    String actual = JsonRecordFormatter.fromSchema(schema).formatRecord(simple);
    assertEquals(expected, actual);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 64 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by axbaretto.

the class ParquetRecordWriter method getPrimitiveType.

/**
 * Builds the Parquet {@link PrimitiveType} describing the given field.
 *
 * <p>Every attribute except the name is looked up through {@code ParquetTypeHelper},
 * keyed on the field's minor type, its data mode, or the field itself.
 *
 * @param field field to describe
 * @return a primitive type named after the field
 */
private PrimitiveType getPrimitiveType(MaterializedField field) {
    final MinorType drillType = field.getType().getMinorType();
    final String columnName = field.getName();
    final PrimitiveTypeName parquetTypeName = ParquetTypeHelper.getPrimitiveTypeNameForMinorType(drillType);
    final Repetition parquetRepetition = ParquetTypeHelper.getRepetitionForDataMode(field.getDataMode());
    final OriginalType parquetOriginalType = ParquetTypeHelper.getOriginalTypeForMinorType(drillType);
    final DecimalMetadata decimalInfo = ParquetTypeHelper.getDecimalMetadataForField(field);
    final int typeLength = ParquetTypeHelper.getLengthForMinorType(drillType);
    // The trailing constructor argument is passed as null, as before.
    // NOTE(review): presumably the optional field id - confirm against the PrimitiveType API.
    return new PrimitiveType(
            parquetRepetition,
            parquetTypeName,
            typeLength,
            columnName,
            parquetOriginalType,
            decimalInfo,
            null);
}
Also used : OriginalType(org.apache.parquet.schema.OriginalType) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DecimalMetadata(org.apache.parquet.schema.DecimalMetadata) Repetition(org.apache.parquet.schema.Type.Repetition) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName)

Example 65 with PrimitiveType

use of org.apache.parquet.schema.PrimitiveType in project drill by axbaretto.

the class Metadata method getColTypeInfo.

/**
 * Resolves the column type information for the primitive reached by following
 * {@code path} from {@code type}.
 *
 * <p>Starting at {@code type}, each non-primitive node is cast to {@link GroupType} and
 * the child named {@code path[depth]} is taken, with {@code depth} advancing by one per
 * step, until a primitive node is reached.
 *
 * @param schema schema used to obtain the maximum repetition and definition levels of {@code path}
 * @param type   node to start from
 * @param path   column path; the element at index {@code depth} names the next child to descend into
 * @param depth  index into {@code path} of the next child name
 * @return the original type, precision, scale, repetition level and definition level of the
 *         column; precision and scale are 0 when the primitive has no decimal metadata
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
    // Iterative form of the descent: step through group nodes until a primitive is found.
    Type node = type;
    int level = depth;
    while (!node.isPrimitive()) {
        node = ((GroupType) node).getType(path[level]);
        level++;
    }

    final PrimitiveType leaf = (PrimitiveType) node;
    int precision = 0;
    int scale = 0;
    final boolean hasDecimalMetadata = leaf.getDecimalMetadata() != null;
    if (hasDecimalMetadata) {
        precision = leaf.getDecimalMetadata().getPrecision();
        scale = leaf.getDecimalMetadata().getScale();
    }

    final int repetitionLevel = schema.getMaxRepetitionLevel(path);
    final int definitionLevel = schema.getMaxDefinitionLevel(path);
    return new ColTypeInfo(node.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)123 Test (org.junit.Test)66 MessageType (org.apache.parquet.schema.MessageType)41 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 GroupType (org.apache.parquet.schema.GroupType)25 Test (org.testng.annotations.Test)20 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)14 OriginalType (org.apache.parquet.schema.OriginalType)14 Type (org.apache.parquet.schema.Type)12 List (java.util.List)11 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)11 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)11 ArrayList (java.util.ArrayList)10