Search in sources :

Example 61 with Type

use of org.apache.parquet.schema.Type in project drill by axbaretto.

the class ParquetRecordWriter method getType.

private Type getType(MaterializedField field) {
    MinorType minorType = field.getType().getMinorType();
    DataMode dataMode = field.getType().getMode();
    switch(minorType) {
        case MAP:
            List<Type> types = Lists.newArrayList();
            for (MaterializedField childField : field.getChildren()) {
                types.add(getType(childField));
            }
            return new GroupType(dataMode == DataMode.REPEATED ? Repetition.REPEATED : Repetition.OPTIONAL, field.getName(), types);
        case LIST:
            throw new UnsupportedOperationException("Unsupported type " + minorType);
        default:
            return getPrimitiveType(field);
    }
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) DataMode(org.apache.drill.common.types.TypeProtos.DataMode) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) MaterializedField(org.apache.drill.exec.record.MaterializedField)

Example 62 with Type

use of org.apache.parquet.schema.Type in project drill by axbaretto.

the class Metadata method getColTypeInfo.

private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
    if (type.isPrimitive()) {
        PrimitiveType primitiveType = (PrimitiveType) type;
        int precision = 0;
        int scale = 0;
        if (primitiveType.getDecimalMetadata() != null) {
            precision = primitiveType.getDecimalMetadata().getPrecision();
            scale = primitiveType.getDecimalMetadata().getScale();
        }
        int repetitionLevel = schema.getMaxRepetitionLevel(path);
        int definitionLevel = schema.getMaxDefinitionLevel(path);
        return new ColTypeInfo(type.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
    }
    Type t = ((GroupType) type).getType(path[depth]);
    return getColTypeInfo(schema, t, path, depth + 1);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) GroupType(org.apache.parquet.schema.GroupType) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 63 with Type

use of org.apache.parquet.schema.Type in project drill by axbaretto.

the class DrillParquetReader method setup.

@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    try {
        this.operatorContext = context;
        schema = footer.getFileMetaData().getSchema();
        MessageType projection;
        if (isStarQuery()) {
            projection = schema;
        } else {
            columnsNotFound = new ArrayList<>();
            projection = getProjection(schema, getColumns(), columnsNotFound);
            if (projection == null) {
                projection = schema;
            }
            if (columnsNotFound != null && columnsNotFound.size() > 0) {
                nullFilledVectors = new ArrayList<>();
                for (SchemaPath col : columnsNotFound) {
                    // col.toExpr() is used here as field name since we don't want to see these fields in the existing maps
                    nullFilledVectors.add((NullableIntVector) output.addField(MaterializedField.create(col.toExpr(), org.apache.drill.common.types.Types.optional(TypeProtos.MinorType.INT)), (Class<? extends ValueVector>) TypeHelper.getValueVectorClass(TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)));
                }
                if (columnsNotFound.size() == getColumns().size()) {
                    noColumnsFound = true;
                }
            }
        }
        logger.debug("Requesting schema {}", projection);
        ColumnIOFactory factory = new ColumnIOFactory(false);
        MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
        Map<ColumnPath, ColumnChunkMetaData> paths = new HashMap<>();
        for (ColumnChunkMetaData md : footer.getBlocks().get(entry.getRowGroupIndex()).getColumns()) {
            paths.put(md.getPath(), md);
        }
        Path filePath = new Path(entry.getPath());
        BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
        recordCount = (int) blockMetaData.getRowCount();
        pageReadStore = new ColumnChunkIncReadStore(recordCount, CodecFactory.createDirectCodecFactory(fileSystem.getConf(), new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0), operatorContext.getAllocator(), fileSystem, filePath);
        for (String[] path : schema.getPaths()) {
            Type type = schema.getType(path);
            if (type.isPrimitive()) {
                ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
                pageReadStore.addColumn(schema.getColumnDescription(path), md);
            }
        }
        if (!noColumnsFound) {
            // Discard the columns not found in the schema when create DrillParquetRecordMaterializer, since they have been added to output already.
            @SuppressWarnings("unchecked") final Collection<SchemaPath> columns = columnsNotFound == null || columnsNotFound.size() == 0 ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
            recordMaterializer = new DrillParquetRecordMaterializer(output, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
            recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
        }
    } catch (Exception e) {
        handleAndRaise("Failure in setting up reader", e);
    }
}
Also used : ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) HashMap(java.util.HashMap) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) IOException(java.io.IOException) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnChunkIncReadStore(org.apache.parquet.hadoop.ColumnChunkIncReadStore) MessageType(org.apache.parquet.schema.MessageType)

Example 64 with Type

use of org.apache.parquet.schema.Type in project drill by axbaretto.

the class DrillParquetReader method getProjection.

public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
    MessageType projection = null;
    String messageName = schema.getName();
    List<ColumnDescriptor> schemaColumns = schema.getColumns();
    // parquet type.union() seems to lose ConvertedType info when merging two columns that are the same type. This can
    // happen when selecting two elements from an array. So to work around this, we use set of SchemaPath to avoid duplicates
    // and then merge the types at the end
    Set<SchemaPath> selectedSchemaPaths = Sets.newLinkedHashSet();
    // get a list of modified columns which have the array elements removed from the schema path since parquet schema doesn't include array elements
    List<SchemaPath> modifiedColumns = Lists.newLinkedList();
    for (SchemaPath path : columns) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = path.getRootSegment();
        do {
            if (seg.isNamed()) {
                segments.add(seg.getNameSegment().getPath());
            }
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        SchemaPath modifiedSchemaPath = SchemaPath.getCompoundPath(pathSegments);
        modifiedColumns.add(modifiedSchemaPath);
    }
    // convert the columns in the parquet schema to a list of SchemaPath columns so that they can be compared in case insensitive manner
    // to the projection columns
    List<SchemaPath> schemaPaths = Lists.newLinkedList();
    for (ColumnDescriptor columnDescriptor : schemaColumns) {
        String[] schemaColDesc = Arrays.copyOf(columnDescriptor.getPath(), columnDescriptor.getPath().length);
        SchemaPath schemaPath = SchemaPath.getCompoundPath(schemaColDesc);
        schemaPaths.add(schemaPath);
    }
    // loop through projection columns and add any columns that are missing from parquet schema to columnsNotFound list
    for (SchemaPath columnPath : modifiedColumns) {
        boolean notFound = true;
        for (SchemaPath schemaPath : schemaPaths) {
            if (schemaPath.contains(columnPath)) {
                selectedSchemaPaths.add(schemaPath);
                notFound = false;
            }
        }
        if (notFound) {
            columnsNotFound.add(columnPath);
        }
    }
    // convert SchemaPaths from selectedSchemaPaths and convert to parquet type, and merge into projection schema
    for (SchemaPath schemaPath : selectedSchemaPaths) {
        List<String> segments = Lists.newArrayList();
        PathSegment seg = schemaPath.getRootSegment();
        do {
            segments.add(seg.getNameSegment().getPath());
        } while ((seg = seg.getChild()) != null);
        String[] pathSegments = new String[segments.size()];
        segments.toArray(pathSegments);
        Type t = getType(pathSegments, 0, schema);
        if (projection == null) {
            projection = new MessageType(messageName, t);
        } else {
            projection = projection.union(new MessageType(messageName, t));
        }
    }
    return projection;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PathSegment(org.apache.drill.common.expression.PathSegment) MessageType(org.apache.parquet.schema.MessageType)

Example 65 with Type

use of org.apache.parquet.schema.Type in project drill by apache.

the class DrillParquetReader method adaptColumnsToParquetSchema.

/**
 * This method adjusts collection of SchemaPath projection columns to better match columns in given
 * schema. It does few things to reach the goal:
 * <ul>
 *   <li>skips ArraySegments if present;</li>
 *   <li>interrupts further projections for Parquet MAPs to allow EvaluationVisitor manage get by key logic;</li>
 *   <li>adds additional listName and elementName for logical lists, because they exists in schema but absent in original projection columns.</li>
 * </ul>
 *
 * @param columns original projection columns
 * @param schema Parquet file schema
 * @return adjusted projection columns
 */
private static List<SchemaPath> adaptColumnsToParquetSchema(Collection<SchemaPath> columns, MessageType schema) {
    List<SchemaPath> modifiedColumns = new LinkedList<>();
    for (SchemaPath path : columns) {
        List<String> segments = new ArrayList<>();
        Type segmentType = schema;
        for (PathSegment seg = path.getRootSegment(); seg != null; seg = seg.getChild()) {
            if (seg.isNamed()) {
                segments.add(seg.getNameSegment().getPath());
            }
            segmentType = getSegmentType(segmentType, seg);
            if (segmentType != null && !segmentType.isPrimitive()) {
                GroupType segGroupType = segmentType.asGroupType();
                if (ParquetReaderUtility.isLogicalMapType(segGroupType)) {
                    // later as values obtained from dict by key differ from the actual column's path
                    break;
                } else if (ParquetReaderUtility.isLogicalListType(segGroupType)) {
                    // 'list' or 'bag'
                    String listName = segGroupType.getType(0).getName();
                    // 'element' or 'array_element'
                    String elementName = segGroupType.getType(0).asGroupType().getType(0).getName();
                    segments.add(listName);
                    segments.add(elementName);
                }
            }
        }
        modifiedColumns.add(SchemaPath.getCompoundPath(segments.toArray(new String[0])));
    }
    return modifiedColumns;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) GroupType(org.apache.parquet.schema.GroupType) SchemaPath(org.apache.drill.common.expression.SchemaPath) ArrayList(java.util.ArrayList) PathSegment(org.apache.drill.common.expression.PathSegment) LinkedList(java.util.LinkedList)

Aggregations

Type (org.apache.parquet.schema.Type)78 GroupType (org.apache.parquet.schema.GroupType)67 MessageType (org.apache.parquet.schema.MessageType)62 OriginalType (org.apache.parquet.schema.OriginalType)39 PrimitiveType (org.apache.parquet.schema.PrimitiveType)34 ArrayList (java.util.ArrayList)24 SchemaPath (org.apache.drill.common.expression.SchemaPath)10 HashMap (java.util.HashMap)9 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)9 PathSegment (org.apache.drill.common.expression.PathSegment)8 Converter (org.apache.parquet.io.api.Converter)6 GroupConverter (org.apache.parquet.io.api.GroupConverter)6 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)5 MaterializedField (org.apache.drill.exec.record.MaterializedField)5 Collection (java.util.Collection)4 List (java.util.List)4 Function (java.util.function.Function)4 LogicalType (org.apache.avro.LogicalType)4 DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException)4 ExecConstants (org.apache.drill.exec.ExecConstants)4