Search in sources :

Example 56 with Type

Use of org.apache.parquet.schema.Type in the apache/parquet-mr project.

From the class DataWritableReadSupport, method resolveSchemaAccess.

/**
 * Determines the requested schema by resolving each requested column against the file
 * schema by position (column index) rather than by name, when index-based access is
 * enabled via {@code PARQUET_COLUMN_INDEX_ACCESS}.
 *
 * @param requestedSchema the schema built from the requested column names
 * @param fileSchema      the schema actually stored in the Parquet file
 * @param configuration   job configuration; supplies the configured column list and
 *                        the index-access flag
 * @return the schema to use for reading: position-resolved when index access is on,
 *         otherwise {@code requestedSchema} unchanged
 */
private MessageType resolveSchemaAccess(MessageType requestedSchema, MessageType fileSchema, Configuration configuration) {
    if (configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false)) {
        final List<String> listColumns = getColumns(configuration.get(IOConstants.COLUMNS));
        List<Type> requestedTypes = new ArrayList<Type>();
        for (Type t : requestedSchema.getFields()) {
            int index = listColumns.indexOf(t.getName());
            if (index < 0) {
                // indexOf returns -1 for an unknown column; fileSchema.getType(-1) would
                // fail later with an opaque IndexOutOfBoundsException. Fail fast with
                // the offending column name instead.
                throw new IllegalArgumentException("Column '" + t.getName()
                    + "' not found in the configured column list " + listColumns);
            }
            requestedTypes.add(fileSchema.getType(index));
        }
        requestedSchema = new MessageType(requestedSchema.getName(), requestedTypes);
    }
    return requestedSchema;
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType)

Example 57 with Type

Use of org.apache.parquet.schema.Type in the apache/parquet-mr project.

From the class AvroSchemaConverter, method convertUnion.

/**
 * Converts an Avro union schema to a Parquet type. A NULL branch in the union makes
 * the resulting field OPTIONAL; a union with exactly one non-null branch collapses to
 * that branch; a larger union becomes a group with one optional "memberN" field per
 * non-null branch.
 */
private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition) {
    // Partition out the NULL branches: they only influence the result's repetition.
    List<Schema> branches = new ArrayList<Schema>(schema.getTypes().size());
    for (Schema member : schema.getTypes()) {
        if (Schema.Type.NULL.equals(member.getType())) {
            if (repetition == Type.Repetition.REQUIRED) {
                repetition = Type.Repetition.OPTIONAL;
            }
        } else {
            branches.add(member);
        }
    }
    if (branches.isEmpty()) {
        throw new UnsupportedOperationException("Cannot convert Avro union of only nulls");
    }
    if (branches.size() == 1) {
        // Single real branch: no union container needed.
        return convertField(fieldName, branches.get(0), repetition);
    }
    // Complex union: wrap each branch as an optional member of a group container.
    List<Type> members = new ArrayList<Type>(branches.size());
    for (int i = 0; i < branches.size(); i++) {
        members.add(convertField("member" + i, branches.get(i), Type.Repetition.OPTIONAL));
    }
    return new GroupType(repetition, fieldName, members);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) LogicalType(org.apache.avro.LogicalType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList)

Example 58 with Type

Use of org.apache.parquet.schema.Type in the apache/parquet-mr project.

From the class TestPigSchemaConverter, method testListsOfPrimitive.

/**
 * Smoke test: converting a LIST group of every primitive value type (except the
 * unsupported INT96) under every combination of group and value repetition must
 * complete without throwing.
 */
@Test
public void testListsOfPrimitive() throws Exception {
    for (Type.Repetition groupRepetition : Type.Repetition.values()) {
        for (Type.Repetition valueRepetition : Type.Repetition.values()) {
            for (PrimitiveType.PrimitiveTypeName typeName : PrimitiveType.PrimitiveTypeName.values()) {
                if (typeName == PrimitiveType.PrimitiveTypeName.INT96) {
                    continue; // INT96 is NYI
                }
                Types.PrimitiveBuilder<PrimitiveType> valueBuilder = Types.primitive(typeName, valueRepetition);
                if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
                    valueBuilder.length(1);
                }
                GroupType listType = Types.buildGroup(groupRepetition)
                        .addField(valueBuilder.named("b"))
                        .as(OriginalType.LIST)
                        .named("a");
                // no exceptions, please
                pigSchemaConverter.convertField(listType);
            }
        }
    }
}
Also used : Types(org.apache.parquet.schema.Types) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.junit.Test)

Example 59 with Type

Use of org.apache.parquet.schema.Type in the apache/parquet-mr project.

From the class GroupWriter, method writeGroup.

/**
 * Emits every populated field of the given group to the record consumer, recursing
 * into nested groups. Fields whose repetition count is zero produce no output at all
 * (no startField/endField pair is written for them).
 */
private void writeGroup(Group group, GroupType type) {
    for (int field = 0, fieldCount = type.getFieldCount(); field < fieldCount; field++) {
        int valueCount = group.getFieldRepetitionCount(field);
        if (valueCount == 0) {
            continue; // absent field: emit nothing
        }
        Type fieldType = type.getType(field);
        String fieldName = fieldType.getName();
        boolean primitive = fieldType.isPrimitive();
        recordConsumer.startField(fieldName, field);
        for (int index = 0; index < valueCount; index++) {
            if (primitive) {
                group.writeValue(field, index, recordConsumer);
            } else {
                // Nested group: bracket the recursion with startGroup/endGroup.
                recordConsumer.startGroup();
                writeGroup(group.getGroup(field, index), fieldType.asGroupType());
                recordConsumer.endGroup();
            }
        }
        recordConsumer.endField(fieldName, field);
    }
}
Also used : GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type)

Example 60 with Type

Use of org.apache.parquet.schema.Type in the axbaretto/drill project.

From the class ParquetRecordWriter, method newSchema.

/**
 * Rebuilds the Parquet write pipeline (schema, page store, column write store, record
 * consumer) from the current batch schema. The components are constructed in order and
 * wired into each other, so this whole sequence must run as a unit.
 *
 * @throws IOException if a pipeline component cannot be created
 */
private void newSchema() throws IOException {
    // Translate each materialized field to a Parquet type, skipping the internal
    // partition-comparator column, which must not appear in the output file.
    List<Type> types = Lists.newArrayList();
    for (MaterializedField field : batchSchema) {
        if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
            continue;
        }
        types.add(getType(field));
    }
    schema = new MessageType("root", types);
    // We don't want this number to be too small, ideally we divide the block equally across the columns.
    // It is unlikely all columns are going to be the same size.
    // Its value is likely below Integer.MAX_VALUE (2GB), although rowGroupSize is a long type.
    // Therefore this size is cast to int, since allocating byte array in under layer needs to
    // limit the array size in an int scope.
    int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5);
    // We don't want this number to be too small either. Ideally, slightly bigger than the page size,
    // but not bigger than the block buffer
    int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
    // TODO: Use initialSlabSize from ParquetProperties once drill will be updated to the latest version of Parquet library
    int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
    // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from parquet library
    // once PARQUET-1006 will be resolved
    pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize, pageSize, new ParquetDirectByteBufferAllocator(oContext));
    store = new ColumnWriteStoreV1(pageStore, pageSize, initialPageBufferSize, enableDictionary, writerVersion, new ParquetDirectByteBufferAllocator(oContext));
    // Build the column IO tree for the new schema and attach a fresh record writer.
    MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
    consumer = columnIO.getRecordWriter(store);
    setUp(schema, consumer);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) ParquetColumnChunkPageWriteStore(org.apache.parquet.hadoop.ParquetColumnChunkPageWriteStore) ColumnWriteStoreV1(org.apache.parquet.column.impl.ColumnWriteStoreV1) MaterializedField(org.apache.drill.exec.record.MaterializedField) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) MessageType(org.apache.parquet.schema.MessageType) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory)

Aggregations

Type (org.apache.parquet.schema.Type)78 GroupType (org.apache.parquet.schema.GroupType)67 MessageType (org.apache.parquet.schema.MessageType)62 OriginalType (org.apache.parquet.schema.OriginalType)39 PrimitiveType (org.apache.parquet.schema.PrimitiveType)34 ArrayList (java.util.ArrayList)24 SchemaPath (org.apache.drill.common.expression.SchemaPath)10 HashMap (java.util.HashMap)9 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)9 PathSegment (org.apache.drill.common.expression.PathSegment)8 Converter (org.apache.parquet.io.api.Converter)6 GroupConverter (org.apache.parquet.io.api.GroupConverter)6 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)5 MaterializedField (org.apache.drill.exec.record.MaterializedField)5 Collection (java.util.Collection)4 List (java.util.List)4 Function (java.util.function.Function)4 LogicalType (org.apache.avro.LogicalType)4 DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException)4 ExecConstants (org.apache.drill.exec.ExecConstants)4