
Example 36 with Type

use of org.apache.parquet.schema.Type in project presto by prestodb.

the class SingleLevelArraySchemaConverter method convertMapType.

// A group (with the given repetition) wrapping a repeated anonymous group "map"
// that holds two fields: "key" and "value"
private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo, final Repetition repetition) {
    final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED);
    final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), typeInfo.getMapValueTypeInfo());
    return ConversionPatterns.mapType(repetition, name, keyType, valueType);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType)
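
A minimal, self-contained sketch of the layout this conversion produces. It is not part of the Presto sources; the binary key/value leaves and the "my_map" name are assumptions chosen for illustration only.

import org.apache.parquet.schema.ConversionPatterns;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;

public class MapSchemaSketch {
    public static void main(String[] args) {
        // The key is required and the value optional, mirroring convertMapType above.
        PrimitiveType key = new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.BINARY, "key");
        PrimitiveType value = new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "value");
        // mapType wraps key/value in a repeated anonymous group "map" inside the outer group "my_map".
        GroupType map = ConversionPatterns.mapType(Repetition.OPTIONAL, "my_map", key, value);
        System.out.println(map);
    }
}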

Example 37 with Type

use of org.apache.parquet.schema.Type in project presto by prestodb.

the class ParquetTypeVisitor method visit.

public static <T> T visit(Type type, ParquetTypeVisitor<T> visitor) {
    if (type instanceof MessageType) {
        return visitor.message((MessageType) type, visitFields(type.asGroupType(), visitor));
    } else if (type.isPrimitive()) {
        return visitor.primitive(type.asPrimitiveType());
    } else {
        // not a message or a primitive, so the type must be a group
        GroupType group = type.asGroupType();
        OriginalType annotation = group.getOriginalType();
        if (annotation == LIST) {
            checkArgument(!group.isRepetition(REPEATED), "Invalid list: top-level group is repeated: " + group);
            checkArgument(group.getFieldCount() == 1, "Invalid list: does not contain single repeated field: " + group);
            GroupType repeatedElement = group.getFields().get(0).asGroupType();
            checkArgument(repeatedElement.isRepetition(REPEATED), "Invalid list: inner group is not repeated");
            checkArgument(repeatedElement.getFieldCount() <= 1, "Invalid list: repeated group is not a single field: " + group);
            visitor.fieldNames.push(repeatedElement.getName());
            try {
                T elementResult = null;
                if (repeatedElement.getFieldCount() > 0) {
                    elementResult = visitField(repeatedElement.getType(0), visitor);
                }
                return visitor.list(group, elementResult);
            } finally {
                visitor.fieldNames.pop();
            }
        } else if (annotation == MAP) {
            checkArgument(!group.isRepetition(REPEATED), "Invalid map: top-level group is repeated: " + group);
            checkArgument(group.getFieldCount() == 1, "Invalid map: does not contain single repeated field: " + group);
            GroupType repeatedKeyValue = group.getType(0).asGroupType();
            checkArgument(repeatedKeyValue.isRepetition(REPEATED), "Invalid map: inner group is not repeated");
            checkArgument(repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields");
            visitor.fieldNames.push(repeatedKeyValue.getName());
            try {
                T keyResult = null;
                T valueResult = null;
                if (repeatedKeyValue.getFieldCount() == 2) {
                    keyResult = visitField(repeatedKeyValue.getType(0), visitor);
                    valueResult = visitField(repeatedKeyValue.getType(1), visitor);
                } else if (repeatedKeyValue.getFieldCount() == 1) {
                    Type keyOrValue = repeatedKeyValue.getType(0);
                    if (keyOrValue.getName().equalsIgnoreCase("key")) {
                        keyResult = visitField(keyOrValue, visitor);
                    // value result remains null
                    } else {
                        valueResult = visitField(keyOrValue, visitor);
                    // key result remains null
                    }
                }
                return visitor.map(group, keyResult, valueResult);
            } finally {
                visitor.fieldNames.pop();
            }
        }
        return visitor.struct(group, visitFields(group, visitor));
    }
}
Also used : Type(org.apache.parquet.schema.Type) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) OriginalType(org.apache.parquet.schema.OriginalType) LIST(org.apache.parquet.schema.OriginalType.LIST)
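
A small, hypothetical walker, independent of the Presto visitor class, that recurses over a parsed schema and classifies each group by the same LIST/MAP annotations the visit method inspects. The schema string is made up for illustration.

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;

public class SchemaWalkSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message doc { "
                + "  optional group tags (LIST) { repeated group bag { optional binary array_element; } } "
                + "  optional group attrs (MAP) { repeated group map { required binary key; optional binary value; } } "
                + "}");
        walk(schema, "");
    }

    private static void walk(GroupType group, String indent) {
        for (Type field : group.getFields()) {
            if (field.isPrimitive()) {
                System.out.println(indent + field.getName() + ": primitive");
            } else {
                GroupType child = field.asGroupType();
                // LIST and MAP groups wrap a repeated inner group, which is exactly what visit() validates above.
                OriginalType annotation = child.getOriginalType();
                String kind = annotation == OriginalType.LIST ? "list"
                        : annotation == OriginalType.MAP ? "map" : "struct";
                System.out.println(indent + child.getName() + ": " + kind);
                walk(child, indent + "  ");
            }
        }
    }
}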

Example 38 with Type

use of org.apache.parquet.schema.Type in project flink by apache.

the class ParquetVectorizedInputFormat method checkSchema.

private void checkSchema(MessageType fileSchema, MessageType requestedSchema) throws IOException, UnsupportedOperationException {
    if (projectedFields.length != requestedSchema.getFieldCount()) {
        throw new RuntimeException("The quality of field type is incompatible with the request schema!");
    }
    /*
     * Check that the requested schema is supported.
     */
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
        Type t = requestedSchema.getFields().get(i);
        if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
            throw new UnsupportedOperationException("Complex types not supported.");
        }
        String[] colPath = requestedSchema.getPaths().get(i);
        if (fileSchema.containsPath(colPath)) {
            ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
            if (!fd.equals(requestedSchema.getColumns().get(i))) {
                throw new UnsupportedOperationException("Schema evolution not supported.");
            }
        } else {
            if (requestedSchema.getColumns().get(i).getMaxDefinitionLevel() == 0) {
                // The column is missing from the file but is non-nullable
                // (max definition level 0), so the file cannot satisfy the request.
                throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
            }
        }
    }
}
Also used : RowType(org.apache.flink.table.types.logical.RowType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) LogicalType(org.apache.flink.table.types.logical.LogicalType) Type(org.apache.parquet.schema.Type) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException)
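
A self-contained sketch of the same path-based compatibility check against the public parquet-mr API. The schemas and column names are assumptions, not Flink code: a requested column must either exist in the file with a matching column descriptor, or be nullable (max definition level greater than zero) so it can be filled with nulls.

import java.util.Arrays;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ProjectionCheckSketch {
    public static void main(String[] args) {
        MessageType fileSchema = MessageTypeParser.parseMessageType(
                "message file { required int64 id; optional binary name; }");
        MessageType requestedSchema = MessageTypeParser.parseMessageType(
                "message requested { required int64 id; optional binary extra_col; }");
        for (int i = 0; i < requestedSchema.getFieldCount(); i++) {
            String[] colPath = requestedSchema.getPaths().get(i);
            if (fileSchema.containsPath(colPath)) {
                // Present in the file: the column descriptors must match for the projection to be usable.
                ColumnDescriptor fileColumn = fileSchema.getColumnDescription(colPath);
                ColumnDescriptor requestedColumn = requestedSchema.getColumns().get(i);
                System.out.println(Arrays.toString(colPath) + " present, descriptors match: "
                        + fileColumn.equals(requestedColumn));
            } else {
                // Missing from the file: only acceptable if the requested column is nullable.
                boolean nullable = requestedSchema.getColumns().get(i).getMaxDefinitionLevel() > 0;
                System.out.println(Arrays.toString(colPath) + " missing, can be read as null: " + nullable);
            }
        }
    }
}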

Example 39 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class ParquetFilterPredicateConverter method translate.

private static FilterPredicate translate(ExpressionTree root, List<PredicateLeaf> leaves, Map<String, TypeInfo> columns, MessageType schema) throws Exception {
    FilterPredicate p = null;
    switch(root.getOperator()) {
        case OR:
            for (ExpressionTree child : root.getChildren()) {
                FilterPredicate childPredicate = translate(child, leaves, columns, schema);
                if (childPredicate == null) {
                    return null;
                }
                if (p == null) {
                    p = childPredicate;
                } else {
                    p = FilterApi.or(p, childPredicate);
                }
            }
            return p;
        case AND:
            for (ExpressionTree child : root.getChildren()) {
                if (p == null) {
                    p = translate(child, leaves, columns, schema);
                } else {
                    FilterPredicate right = translate(child, leaves, columns, schema);
                    // constant means no filter, ignore it when it is null
                    if (right != null) {
                        p = FilterApi.and(p, right);
                    }
                }
            }
            return p;
        case NOT:
            FilterPredicate op = translate(root.getChildren().get(0), leaves, columns, schema);
            if (op != null) {
                return FilterApi.not(op);
            } else {
                return null;
            }
        case LEAF:
            PredicateLeaf leaf = leaves.get(root.getLeaf());
            // Only build a predicate for leaves whose column is present in the provided column map
            if (columns.containsKey(leaf.getColumnName())) {
                Type parquetType = schema.getType(leaf.getColumnName());
                TypeInfo hiveType = columns.get(leaf.getColumnName());
                return buildFilterPredicateFromPredicateLeaf(leaf, parquetType, hiveType);
            } else {
                // Do not create predicate if the leaf is not on the passed schema.
                return null;
            }
        case CONSTANT:
            // no filter will be executed for constant
            return null;
        default:
            throw new IllegalStateException("Unknown operator: " + root.getOperator());
    }
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) PredicateLeaf(org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf) ExpressionTree(org.apache.hadoop.hive.ql.io.sarg.ExpressionTree) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
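
A brief, hypothetical sketch of the kind of predicate this translation assembles, built directly with FilterApi. The "age" and "name" columns are illustrative only.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.BinaryColumn;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.io.api.Binary;

public class FilterSketch {
    public static void main(String[] args) {
        IntColumn age = FilterApi.intColumn("age");
        BinaryColumn name = FilterApi.binaryColumn("name");
        // (age = 30 OR age = 40) AND NOT (name = 'bob'), mirroring the OR/AND/NOT cases above.
        FilterPredicate predicate = FilterApi.and(
                FilterApi.or(FilterApi.eq(age, 30), FilterApi.eq(age, 40)),
                FilterApi.not(FilterApi.eq(name, Binary.fromString("bob"))));
        System.out.println(predicate);
    }
}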

Example 40 with Type

use of org.apache.parquet.schema.Type in project hive by apache.

the class DataWritableReadSupport method getProjectedSchema.

/**
 * Generates the projected schema from colIndexes and nested column paths. If a column is not
 * pruned via nestedColumnPaths, its type is added directly; otherwise a group type containing
 * only the required sub-fields is built from nestedColumnPaths.
 * @param schema the original file schema
 * @param colNames the names of the table columns
 * @param colIndexes the indexes of the needed columns
 * @param nestedColumnPaths the paths for nested columns
 * @return the projected schema containing only the requested columns
 */
public static MessageType getProjectedSchema(MessageType schema, List<String> colNames, List<Integer> colIndexes, Set<String> nestedColumnPaths) {
    List<Type> schemaTypes = new ArrayList<Type>();
    Map<String, FieldNode> prunedCols = getPrunedNestedColumns(nestedColumnPaths);
    for (Integer i : colIndexes) {
        if (i < colNames.size()) {
            if (i < schema.getFieldCount()) {
                Type t = schema.getType(i);
                String tn = t.getName().toLowerCase();
                if (!prunedCols.containsKey(tn)) {
                    schemaTypes.add(schema.getType(i));
                } else {
                    if (t.isPrimitive()) {
                        // For primitive type, add directly.
                        schemaTypes.add(t);
                    } else {
                        // For group type, we need to build the projected group type with required leaves
                        List<Type> g = projectLeafTypes(Arrays.asList(t), Arrays.asList(prunedCols.get(tn)));
                        if (!g.isEmpty()) {
                            schemaTypes.addAll(g);
                        }
                    }
                }
            } else {
                // prefixing with '_mask_' to ensure no conflict with named
                // columns in the file schema
                schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
            }
        }
    }
    return new MessageType(schema.getName(), schemaTypes);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) FieldNode(org.apache.hadoop.hive.ql.optimizer.FieldNode) ArrayList(java.util.ArrayList)
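
A minimal, hypothetical sketch of the same index-based projection outside Hive: selected top-level fields are kept, and a "_mask_" placeholder is added for an index that falls beyond the file schema. The schema and column names are made up.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class ProjectionSketch {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message hive_schema { optional int64 id; optional binary name; optional double price; }");
        List<String> colNames = Arrays.asList("id", "name", "price", "added_col");
        List<Integer> colIndexes = Arrays.asList(0, 2, 3);  // project id, price, and a column missing from the file
        List<Type> projected = new ArrayList<>();
        for (Integer i : colIndexes) {
            if (i < schema.getFieldCount()) {
                projected.add(schema.getType(i));
            } else {
                // Requested but absent from the file schema: add a masked placeholder, as above.
                projected.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
            }
        }
        System.out.println(new MessageType(schema.getName(), projected));
    }
}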

Aggregations

Type (org.apache.parquet.schema.Type) 88
MessageType (org.apache.parquet.schema.MessageType) 72
GroupType (org.apache.parquet.schema.GroupType) 69
OriginalType (org.apache.parquet.schema.OriginalType) 35
PrimitiveType (org.apache.parquet.schema.PrimitiveType) 35
ArrayList (java.util.ArrayList) 25
HashMap (java.util.HashMap) 10
SchemaPath (org.apache.drill.common.expression.SchemaPath) 10
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) 10
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) 10
PathSegment (org.apache.drill.common.expression.PathSegment) 8
Converter (org.apache.parquet.io.api.Converter) 6
GroupConverter (org.apache.parquet.io.api.GroupConverter) 6
MinorType (org.apache.drill.common.types.TypeProtos.MinorType) 5
MaterializedField (org.apache.drill.exec.record.MaterializedField) 5
LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation) 5
Collection (java.util.Collection) 4
List (java.util.List) 4
Function (java.util.function.Function) 4
LogicalType (org.apache.avro.LogicalType) 4