Search in sources :

Example 31 with PathSegment

use of org.apache.drill.common.expression.PathSegment in project drill by axbaretto.

The method schemaPathToFieldPath of the class FieldPathHelper.

/**
 * Builds the {@link FieldPath} that mirrors the given {@link SchemaPath}.
 *
 * <p>Every named segment of the schema path becomes a {@link FieldSegment.NameSegment}
 * and every other segment becomes a {@link FieldSegment.IndexSegment} whose index is
 * the string form of the array index.
 *
 * @param schemaPath {@link SchemaPath} instance to convert
 * @return {@link FieldPath} made of the converted segments, in the same order
 */
public static FieldPath schemaPathToFieldPath(SchemaPath schemaPath) {
    // Push the segments root-first so that they are popped leaf-first below.
    Deque<PathSegment> stack = Queues.newArrayDeque();
    for (PathSegment current = schemaPath.getRootSegment(); current != null; current = current.getChild()) {
        stack.push(current);
    }

    // Each field segment takes its child at construction time, so the chain
    // has to be assembled starting from the leaf and working up to the root.
    FieldSegment tail = null;
    while (!stack.isEmpty()) {
        PathSegment current = stack.pop();
        if (!current.isNamed()) {
            String index = String.valueOf(((PathSegment.ArraySegment) current).getIndex());
            tail = new FieldSegment.IndexSegment(index, tail);
            continue;
        }
        String name = ((PathSegment.NameSegment) current).getPath();
        tail = new FieldSegment.NameSegment(name, tail, false);
    }

    // The last segment built is the root of the chain.
    return new FieldPath((FieldSegment.NameSegment) tail);
}
Also used : FieldSegment(org.ojai.FieldSegment) FieldPath(org.ojai.FieldPath) PathSegment(org.apache.drill.common.expression.PathSegment)

Example 32 with PathSegment

use of org.apache.drill.common.expression.PathSegment in project drill by axbaretto.

The method getProjection of the class DrillParquetReader.

/**
 * Builds a projection of the given parquet schema restricted to the requested columns.
 *
 * <p>Array-index segments are dropped from each requested column before matching, and
 * every schema column that {@code contains} a requested column is selected. Requested
 * columns (in their array-free form) that match no schema column are appended to
 * {@code columnsNotFound}.
 *
 * @param schema          parquet schema to project from
 * @param columns         requested projection columns
 * @param columnsNotFound output list that receives the requested columns missing from the schema
 * @return the merged projection schema, or {@code null} if no schema column was selected
 */
public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
    String messageName = schema.getName();
    List<ColumnDescriptor> schemaColumns = schema.getColumns();

    // The parquet schema has no array elements, so strip the array segments from
    // every requested column and keep only the named parts of its path.
    List<SchemaPath> arrayFreeColumns = Lists.newLinkedList();
    for (SchemaPath requested : columns) {
        List<String> names = Lists.newArrayList();
        PathSegment current = requested.getRootSegment();
        do {
            if (current.isNamed()) {
                names.add(current.getNameSegment().getPath());
            }
            current = current.getChild();
        } while (current != null);
        arrayFreeColumns.add(SchemaPath.getCompoundPath(names.toArray(new String[names.size()])));
    }

    // Express the parquet schema columns as SchemaPaths so that they can be compared
    // with the projection columns (in a case insensitive manner, per the original note).
    List<SchemaPath> parquetPaths = Lists.newLinkedList();
    for (ColumnDescriptor descriptor : schemaColumns) {
        String[] descriptorPath = descriptor.getPath();
        parquetPaths.add(SchemaPath.getCompoundPath(Arrays.copyOf(descriptorPath, descriptorPath.length)));
    }

    // parquet type.union() seems to lose ConvertedType info when merging two columns of
    // the same type (e.g. two elements selected from one array). Collecting the matches
    // in a set first avoids duplicates; the types are merged only at the very end.
    Set<SchemaPath> matchedPaths = Sets.newLinkedHashSet();
    for (SchemaPath wanted : arrayFreeColumns) {
        boolean found = false;
        for (SchemaPath candidate : parquetPaths) {
            if (candidate.contains(wanted)) {
                matchedPaths.add(candidate);
                found = true;
            }
        }
        if (!found) {
            // Report requested columns that are absent from the parquet schema.
            columnsNotFound.add(wanted);
        }
    }

    // Turn each matched path into a parquet type and fold it into the projection schema.
    MessageType projection = null;
    for (SchemaPath matched : matchedPaths) {
        List<String> names = Lists.newArrayList();
        PathSegment current = matched.getRootSegment();
        do {
            names.add(current.getNameSegment().getPath());
            current = current.getChild();
        } while (current != null);
        Type columnType = getType(names.toArray(new String[names.size()]), 0, schema);
        MessageType single = new MessageType(messageName, columnType);
        projection = (projection == null) ? single : projection.union(single);
    }
    return projection;
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PathSegment(org.apache.drill.common.expression.PathSegment) MessageType(org.apache.parquet.schema.MessageType)

Aggregations

PathSegment (org.apache.drill.common.expression.PathSegment)32 SchemaPath (org.apache.drill.common.expression.SchemaPath)14 BaseWriter (org.apache.drill.exec.vector.complex.writer.BaseWriter)6 FieldSegment (org.ojai.FieldSegment)6 ArrayList (java.util.ArrayList)5 MajorType (org.apache.drill.common.types.TypeProtos.MajorType)5 GroupType (org.apache.parquet.schema.GroupType)5 Type (org.apache.parquet.schema.Type)5 FieldPath (org.ojai.FieldPath)5 Stack (java.util.Stack)4 RexNode (org.apache.calcite.rex.RexNode)4 MessageType (org.apache.parquet.schema.MessageType)4 BitSet (java.util.BitSet)3 ValueVector (org.apache.drill.exec.vector.ValueVector)3 NameSegment (org.apache.drill.common.expression.PathSegment.NameSegment)2 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)2 TypedFieldId (org.apache.drill.exec.record.TypedFieldId)2 ColumnMetadata (org.apache.drill.exec.record.metadata.ColumnMetadata)2 ListWriter (org.apache.drill.exec.vector.complex.writer.BaseWriter.ListWriter)2 MapWriter (org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter)2