
Example 1 with AvroParseSpec

Use of org.apache.druid.data.input.avro.AvroParseSpec in project druid by druid-io.

From the class DruidParquetAvroReadSupport, method getPartialReadSchema:

/**
 * Select the columns from the parquet schema that are used in the schema of the ingestion job
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema that only contains the columns that are being used in the schema
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();
    // A flattenSpec may reference arbitrary nested columns, so no pruning is possible.
    if (parseSpec instanceof AvroParseSpec && ((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
        return fullSchema;
    }
    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : parseSpec.getDimensionsSpec().getDimensions()) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    // Keep the timestamp column, every metric input, and every declared dimension;
    // when no dimensions are declared (schemaless ingestion), keep all columns.
    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        if (tsField.equals(type.getName())
                || metricsFields.contains(type.getName())
                || dimensions.isEmpty()
                || dimensions.contains(type.getName())) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
Also used : ArrayList (java.util.ArrayList), HashSet (java.util.HashSet), AvroParseSpec (org.apache.druid.data.input.avro.AvroParseSpec), DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema), ParseSpec (org.apache.druid.data.input.impl.ParseSpec), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type)
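
The field-selection condition at the end of getPartialReadSchema is the core of the column pruning: a parquet field is kept if it is the timestamp column, is required by a metric aggregator, or matches a declared dimension, and when no dimensions are declared at all every field is kept. A minimal, self-contained sketch of that predicate follows, using plain Java collections instead of the Parquet Type API; the class name PartialSchemaSketch and the field names (timestamp, page, user, added, comment) are hypothetical and not taken from the Druid code above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class PartialSchemaSketch {

    /**
     * Mirrors the selection condition above: keep a field if it is the
     * timestamp column, a metric input, or a declared dimension; when no
     * dimensions are declared, keep every field (schemaless ingestion).
     */
    static boolean keepField(String field, String tsField, Set<String> dimensions, Set<String> metricsFields) {
        return tsField.equals(field)
                || metricsFields.contains(field)
                || dimensions.isEmpty()
                || dimensions.contains(field);
    }

    public static void main(String[] args) {
        // Hypothetical ingestion spec values; not taken from the Druid code above.
        String tsField = "timestamp";
        Set<String> dimensions = new HashSet<>(Arrays.asList("page", "user"));
        Set<String> metricsFields = new HashSet<>(Collections.singleton("added"));

        // Hypothetical top-level field names of a parquet file schema.
        List<String> parquetFields = Arrays.asList("timestamp", "page", "user", "added", "comment");

        List<String> kept = new ArrayList<>();
        for (String field : parquetFields) {
            if (keepField(field, tsField, dimensions, metricsFields)) {
                kept.add(field);
            }
        }
        // Prints [timestamp, page, user, added]; "comment" is pruned from the read schema.
        System.out.println(kept);
    }
}

Under this reading, pruning only pays off when the ingestion spec enumerates its dimensions explicitly; a schemaless spec or an Avro flattenSpec forces the reader to keep the full parquet schema.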
