Search in sources :

Example 81 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

the class ParquetGroupConverter method convertField.

/**
 * Reads the field called {@code fieldName} out of group {@code g} and converts it.
 * See {@link ParquetGroupConverter#convertField(Group, String)}
 *
 * <ul>
 *   <li>repeated primitive field: a {@link List} holding every converted value;</li>
 *   <li>non-repeated primitive field: the single converted value;</li>
 *   <li>repeated group field: the result of {@code convertRepeatedFieldToList};</li>
 *   <li>logical map / logical list group: the converted form of the first group instance;</li>
 *   <li>any other group: the nested {@link Group} itself.</li>
 * </ul>
 *
 * @param g group to read from
 * @param fieldName name of the field to look up in the schema of {@code g}
 * @param binaryAsString passed through unchanged to the conversion helpers
 * @return the converted value, or {@code null} when the schema of {@code g} has no such field
 *         or the field holds no values in this group
 */
@Nullable
private static Object convertField(Group g, String fieldName, boolean binaryAsString) {
    final boolean knownField = g.getType().containsField(fieldName);
    if (!knownField) {
        return null;
    }

    final int fieldIndex = g.getType().getFieldIndex(fieldName);
    final int valueCount = g.getFieldRepetitionCount(fieldIndex);
    if (valueCount < 1) {
        // the field is declared in the schema but carries no value in this group
        return null;
    }

    final Type fieldType = g.getType().getFields().get(fieldIndex);
    final boolean isRepeated = fieldType.isRepetition(Type.Repetition.REPEATED);

    if (!fieldType.isPrimitive()) {
        // group field: repeated groups first, then the logical map/list annotations
        if (isRepeated) {
            return convertRepeatedFieldToList(g, fieldIndex, binaryAsString);
        }
        if (isLogicalMapType(fieldType)) {
            return convertLogicalMap(g.getGroup(fieldIndex, 0), binaryAsString);
        }
        if (isLogicalListType(fieldType)) {
            return convertLogicalList(g.getGroup(fieldIndex, 0), binaryAsString);
        }
        // neither a list nor a primitive: hand back the nested group as is
        return g.getGroup(fieldIndex, 0);
    }

    if (!isRepeated) {
        // plain primitive field
        return convertPrimitiveField(g, fieldIndex, binaryAsString);
    }

    // primitive list: convert each occurrence in order
    final List<Object> converted = new ArrayList<>();
    for (int position = 0; position < valueCount; position++) {
        converted.add(convertPrimitiveField(g, fieldIndex, position, binaryAsString));
    }
    return converted;
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) ArrayList(java.util.ArrayList) Nullable(javax.annotation.Nullable)

Example 82 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

the class ParquetGroupConverter method convertRepeatedFieldToList.

/**
 * Collects every occurrence of the repeated field at {@code fieldIndex} of group {@code g}
 * into a list. Primitive occurrences are converted with {@code convertPrimitiveField};
 * group occurrences are added as the nested {@link Group} without further conversion.
 *
 * @param g group that owns the field
 * @param fieldIndex index of the field within the schema of {@code g}; the field is expected
 *                   to have {@code REPEATED} repetition (checked by an assertion only)
 * @param binaryAsString passed through unchanged to {@code convertPrimitiveField}
 * @return list with one entry per occurrence of the field, in order
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString) {
    final Type fieldType = g.getType().getFields().get(fieldIndex);
    assert fieldType.getRepetition().equals(Type.Repetition.REPEATED);

    final int occurrences = g.getFieldRepetitionCount(fieldIndex);
    final List<Object> items = new ArrayList<>();
    for (int position = 0; position < occurrences; position++) {
        final Object item = fieldType.isPrimitive()
            ? convertPrimitiveField(g, fieldIndex, position, binaryAsString)
            : g.getGroup(fieldIndex, position);
        items.add(item);
    }
    return items;
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) ArrayList(java.util.ArrayList)

Example 83 with Type

use of org.apache.parquet.schema.Type in project drill by apache.

the class DrillParquetReader method adaptColumnsToParquetSchema.

/**
 * Adjusts a collection of {@code SchemaPath} projection columns so that they better match the
 * columns of the given schema. To reach that goal it does the following:
 * <ul>
 *   <li>skips array segments (only named segments contribute to the resulting path);</li>
 *   <li>stops processing further segments at a Parquet MAP, so that EvaluationVisitor can manage
 *       the get-by-key logic;</li>
 *   <li>appends the additional list name and element name for logical lists, because they exist
 *       in the schema but are absent from the original projection columns.</li>
 * </ul>
 *
 * @param columns original projection columns
 * @param schema Parquet file schema
 * @return adjusted projection columns, one per input column, in iteration order
 */
private static List<SchemaPath> adaptColumnsToParquetSchema(Collection<SchemaPath> columns, MessageType schema) {
    List<SchemaPath> adjustedColumns = new LinkedList<>();
    for (SchemaPath column : columns) {
        List<String> names = new ArrayList<>();
        Type currentType = schema;
        PathSegment segment = column.getRootSegment();
        while (segment != null) {
            if (segment.isNamed()) {
                names.add(segment.getNameSegment().getPath());
            }
            // becomes null once the path leaves the schema; later lookups then stay null
            currentType = getSegmentType(currentType, segment);
            if (currentType != null && !currentType.isPrimitive()) {
                GroupType currentGroup = currentType.asGroupType();
                if (ParquetReaderUtility.isLogicalMapType(currentGroup)) {
                    // stop at the MAP: values obtained from the dict by key differ from
                    // the actual column's path, so the remaining segments are not appended
                    break;
                }
                if (ParquetReaderUtility.isLogicalListType(currentGroup)) {
                    Type listWrapper = currentGroup.getType(0);
                    // 'list' or 'bag'
                    String listName = listWrapper.getName();
                    // 'element' or 'array_element'
                    String elementName = listWrapper.asGroupType().getType(0).getName();
                    names.add(listName);
                    names.add(elementName);
                }
            }
            segment = segment.getChild();
        }
        adjustedColumns.add(SchemaPath.getCompoundPath(names.toArray(new String[0])));
    }
    return adjustedColumns;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) SchemaPath(org.apache.drill.common.expression.SchemaPath) ArrayList(java.util.ArrayList) PathSegment(org.apache.drill.common.expression.PathSegment) LinkedList(java.util.LinkedList)

Example 84 with Type

use of org.apache.parquet.schema.Type in project drill by apache.

the class DrillParquetReader method getSegmentType.

/**
 * Looks up the type found in {@code schema} under the first {@code depth + 1} entries of
 * {@code pathSegments}. When that prefix is the whole path, the type is returned as is.
 * Otherwise a new group is built with the same repetition, original type and name, containing
 * only the single field obtained by recursing one level deeper along the path.
 *
 * @param pathSegments full path of field names
 * @param depth zero-based index of the path entry to resolve in this call
 * @param schema schema in which the path is resolved
 * @return the type at the end of the path, wrapped in single-field groups for each ancestor
 *         from {@code depth} downwards
 * @throws IllegalStateException if a non-terminal path entry resolves to a primitive type
 */
private static Type getSegmentType(String[] pathSegments, int depth, MessageType schema) {
    final int nextDepth = depth + 1;
    final String[] prefix = Arrays.copyOfRange(pathSegments, 0, nextDepth);
    final Type type = schema.getType(prefix);
    if (nextDepth != pathSegments.length) {
        // more path entries follow, so this one has to be a group
        Preconditions.checkState(!type.isPrimitive());
        return Types.buildGroup(type.getRepetition())
            .as(type.getOriginalType())
            .addField(getSegmentType(pathSegments, nextDepth, schema))
            .named(type.getName());
    }
    return type;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type)

Example 85 with Type

use of org.apache.parquet.schema.Type in project drill by apache.

the class DrillParquetReader method getSegmentType.

/**
 * Gets the type within {@code parentSegmentType} that corresponds to the given {@code segment}.
 * <ul>
 *   <li>For a named segment, the parent's fields are matched against the segment name,
 *       ignoring case.</li>
 *   <li>For a non-named (array index) segment, the element type is returned if the parent is
 *       a logical list.</li>
 * </ul>
 *
 * @param parentSegmentType type from which to extract the field corresponding to the segment;
 *                          may be {@code null}
 * @param segment segment whose type will be returned
 * @return type corresponding to the {@code segment}, or {@code null} if
 *         {@code parentSegmentType} is {@code null} or primitive, or no matching field is found
 *         in it
 */
private static Type getSegmentType(Type parentSegmentType, PathSegment segment) {
    if (parentSegmentType == null || parentSegmentType.isPrimitive()) {
        return null;
    }

    GroupType parentGroup = parentSegmentType.asGroupType();
    if (!segment.isNamed()) {
        // the segment is an array index: only a logical list has an element type to offer
        if (ParquetReaderUtility.isLogicalListType(parentGroup)) {
            return parentGroup.getType(0).asGroupType().getType(0);
        }
        return null;
    }

    String wantedName = segment.getNameSegment().getPath();
    return parentGroup.getFields().stream()
        .filter(candidate -> candidate.getName().equalsIgnoreCase(wantedName))
        .findAny()
        .map(match -> parentGroup.getType(match.getName()))
        .orElse(null);
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType)

Aggregations

Type (org.apache.parquet.schema.Type): 88, MessageType (org.apache.parquet.schema.MessageType): 72, GroupType (org.apache.parquet.schema.GroupType): 69, OriginalType (org.apache.parquet.schema.OriginalType): 35, PrimitiveType (org.apache.parquet.schema.PrimitiveType): 35, ArrayList (java.util.ArrayList): 25, HashMap (java.util.HashMap): 10, SchemaPath (org.apache.drill.common.expression.SchemaPath): 10, TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 10, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10, PathSegment (org.apache.drill.common.expression.PathSegment): 8, Converter (org.apache.parquet.io.api.Converter): 6, GroupConverter (org.apache.parquet.io.api.GroupConverter): 6, MinorType (org.apache.drill.common.types.TypeProtos.MinorType): 5, MaterializedField (org.apache.drill.exec.record.MaterializedField): 5, LogicalTypeAnnotation (org.apache.parquet.schema.LogicalTypeAnnotation): 5, Collection (java.util.Collection): 4, List (java.util.List): 4, Function (java.util.function.Function): 4, LogicalType (org.apache.avro.LogicalType): 4