Search in sources :

Example 76 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

Class DruidParquetAvroReadSupport, method getPartialReadSchema.

/**
 * Select the columns from the parquet schema that are used in the schema of the ingestion job.
 *
 * <p>A top-level column is kept when it is the timestamp column, is a required field of one of the
 * aggregators, or is one of the declared dimensions. When no dimensions are declared, every column
 * is kept. When the parse spec is an {@link AvroParseSpec} with a non-null flatten spec, the full
 * file schema is returned without any pruning.
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema that only contains the columns that are being used in the schema, or
 *         the full file schema when no pruning is applied
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();

    // NOTE(review): presumably a flatten spec may reference columns that are not listed as
    // dimensions, so the schema is not pruned in that case -- confirm against the flattening code.
    if (parseSpec instanceof AvroParseSpec && ((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
        return fullSchema;
    }

    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();

    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : parseSpec.getDimensionsSpec().getDimensions()) {
        dimensions.add(dim.getName());
    }

    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }

    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        String fieldName = type.getName();
        // An empty dimension set means no dimension filter applies, so every column is kept.
        boolean wanted = tsField.equals(fieldName)
                         || metricsFields.contains(fieldName)
                         || dimensions.contains(fieldName)
                         || dimensions.isEmpty();
        if (wanted) {
            partialFields.add(type);
        }
    }
    return new MessageType(fullSchema.getName(), partialFields);
}
Also used : ParseSpec(org.apache.druid.data.input.impl.ParseSpec) AvroParseSpec(org.apache.druid.data.input.avro.AvroParseSpec) ArrayList(java.util.ArrayList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) DimensionSchema(org.apache.druid.data.input.impl.DimensionSchema) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HashSet(java.util.HashSet)

Example 77 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

Class ParquetGroupConverter, method convertField.

/**
 * Looks up {@code fieldName} in the group and converts its content to a plain Java value.
 *
 * <p>Returns {@code null} when the group's type does not contain the field or the field has no
 * values in this group. Otherwise:
 * <ul>
 *   <li>a repeated primitive becomes a list with one converted entry per occurrence;</li>
 *   <li>a non-repeated primitive is converted with {@code convertPrimitiveField};</li>
 *   <li>a repeated group is handed to {@code convertRepeatedFieldToList};</li>
 *   <li>a group accepted by {@code isLogicalMapType} or {@code isLogicalListType} has its first
 *       occurrence converted with {@code convertLogicalMap} or {@code convertLogicalList};</li>
 *   <li>any other group is returned as is (its first occurrence).</li>
 * </ul>
 *
 * @see ParquetGroupConverter#convertField(Group, String)
 */
@Nullable
private static Object convertField(Group g, String fieldName, boolean binaryAsString) {
    // Unknown field name: nothing to convert.
    if (!g.getType().containsField(fieldName)) {
        return null;
    }

    final int idx = g.getType().getFieldIndex(fieldName);
    // The field is declared in the type but carries no value in this group.
    if (g.getFieldRepetitionCount(idx) <= 0) {
        return null;
    }

    final Type declared = g.getType().getFields().get(idx);

    if (declared.isPrimitive()) {
        if (!declared.getRepetition().equals(Type.Repetition.REPEATED)) {
            // single primitive value
            return convertPrimitiveField(g, idx, binaryAsString);
        }
        // repeated primitive: collect every occurrence in order
        final int occurrences = g.getFieldRepetitionCount(idx);
        final List<Object> converted = new ArrayList<>();
        int pos = 0;
        while (pos < occurrences) {
            converted.add(convertPrimitiveField(g, idx, pos, binaryAsString));
            pos++;
        }
        return converted;
    }

    // From here on the field is a group type.
    if (declared.isRepetition(Type.Repetition.REPEATED)) {
        return convertRepeatedFieldToList(g, idx, binaryAsString);
    }
    if (isLogicalMapType(declared)) {
        return convertLogicalMap(g.getGroup(idx, 0), binaryAsString);
    }
    if (isLogicalListType(declared)) {
        return convertLogicalList(g.getGroup(idx, 0), binaryAsString);
    }
    // neither a list nor a map nor a primitive: hand back the nested group itself
    return g.getGroup(idx, 0);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) ArrayList(java.util.ArrayList) Nullable(javax.annotation.Nullable)

Example 78 with Type

use of org.apache.parquet.schema.Type in project druid by druid-io.

Class ParquetGroupConverter, method convertRepeatedFieldToList.

/**
 * Collects every occurrence of a repeated field into a list, in occurrence order.
 *
 * <p>Occurrences of a primitive field are converted with {@code convertPrimitiveField};
 * occurrences of a group field are added to the list as the groups themselves. The field at
 * {@code fieldIndex} is expected to be declared as repeated; this is only checked via an
 * {@code assert}.
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString) {
    final Type fieldType = g.getType().getFields().get(fieldIndex);
    assert fieldType.getRepetition().equals(Type.Repetition.REPEATED);

    final int occurrences = g.getFieldRepetitionCount(fieldIndex);
    final List<Object> collected = new ArrayList<>();
    int pos = 0;
    while (pos < occurrences) {
        final Object element = fieldType.isPrimitive()
                               ? convertPrimitiveField(g, fieldIndex, pos, binaryAsString)
                               : g.getGroup(fieldIndex, pos);
        collected.add(element);
        pos++;
    }
    return collected;
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) ArrayList(java.util.ArrayList)

Aggregations

Type (org.apache.parquet.schema.Type)78 GroupType (org.apache.parquet.schema.GroupType)67 MessageType (org.apache.parquet.schema.MessageType)62 OriginalType (org.apache.parquet.schema.OriginalType)39 PrimitiveType (org.apache.parquet.schema.PrimitiveType)34 ArrayList (java.util.ArrayList)24 SchemaPath (org.apache.drill.common.expression.SchemaPath)10 HashMap (java.util.HashMap)9 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)9 PathSegment (org.apache.drill.common.expression.PathSegment)8 Converter (org.apache.parquet.io.api.Converter)6 GroupConverter (org.apache.parquet.io.api.GroupConverter)6 MinorType (org.apache.drill.common.types.TypeProtos.MinorType)5 MaterializedField (org.apache.drill.exec.record.MaterializedField)5 Collection (java.util.Collection)4 List (java.util.List)4 Function (java.util.function.Function)4 LogicalType (org.apache.avro.LogicalType)4 DrillRuntimeException (org.apache.drill.common.exceptions.DrillRuntimeException)4 ExecConstants (org.apache.drill.exec.ExecConstants)4