Example 26 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From class FlattenSpecParquetInputTest, method testFlat1Autodiscover.

@Test
public void testFlat1Autodiscover() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/flattening/flat_1_autodiscover_fields.json", parserType, true);
    // serialize the ingestion spec into the Hadoop job configuration
    config.intoConfiguration(job);
    // read the first record from the static input paths and parse it into InputRows
    Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(TS1, rows.get(0).getTimestamp().toString());
    Assert.assertEquals("d1v1", rows.get(0).getDimension("dim1").get(0));
    Assert.assertEquals("d2v1", rows.get(0).getDimension("dim2").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("dim3").get(0));
    Assert.assertEquals("listDim1v1", rows.get(0).getDimension("listDim").get(0));
    Assert.assertEquals("listDim1v2", rows.get(0).getDimension("listDim").get(1));
    Assert.assertEquals(1, rows.get(0).getMetric("metric1").longValue());
}
Also used: InputRow (org.apache.druid.data.input.InputRow), List (java.util.List), ImmutableList (com.google.common.collect.ImmutableList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Test (org.junit.Test)
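
The transformHadoopDruidIndexerConfig helper is local to the test class and not shown above; it takes the spec file, a parser type, and a flag. Below is a minimal sketch, assuming only the public HadoopDruidIndexerConfig API, of how a JSON ingestion spec can be loaded and pushed into a Hadoop job the way these tests do. The class name and the omission of any parser-type handling are illustrative, not the project's actual helper.

import java.io.File;
import java.io.IOException;

import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class IndexerConfigLoadingSketch {

    public static void main(String[] args) throws IOException {
        // deserialize a JSON ingestion spec from disk into a HadoopDruidIndexerConfig
        HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/flattening/flat_1.json"));
        // push the config into the Hadoop job configuration, the same round trip
        // the tests above perform before reading rows back out of the job
        Job job = Job.getInstance(new Configuration());
        config.intoConfiguration(job);
    }
}

The later examples recover the same config on the read side with HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration()).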

Example 27 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From class FlattenSpecParquetInputTest, method testFlat1NoFlattenSpec.

@Test
public void testFlat1NoFlattenSpec() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/flattening/flat_1.json", parserType, false);
    config.intoConfiguration(job);
    Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(TS1, rows.get(0).getTimestamp().toString());
    Assert.assertEquals("d1v1", rows.get(0).getDimension("dim1").get(0));
    Assert.assertEquals("d2v1", rows.get(0).getDimension("dim2").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("dim3").get(0));
    Assert.assertEquals("listDim1v1", rows.get(0).getDimension("listDim").get(0));
    Assert.assertEquals("listDim1v2", rows.get(0).getDimension("listDim").get(1));
    Assert.assertEquals(1, rows.get(0).getMetric("metric1").longValue());
}
Also used: InputRow (org.apache.druid.data.input.InputRow), List (java.util.List), ImmutableList (com.google.common.collect.ImmutableList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Test (org.junit.Test)

Example 28 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From class DruidParquetAvroReadSupport, method getPartialReadSchema.

/**
 * Selects the columns from the Parquet file schema that are used by the schema of the ingestion job.
 *
 * @param context the context of the file to be read
 *
 * @return the partial schema containing only the columns used by the ingestion schema
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    // rebuild the indexer config that was serialized into the Hadoop configuration
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();
    // if a flattenSpec is present, keep the full schema: flattened or auto-discovered
    // fields may reference columns that are not declared as dimensions
    if (parseSpec instanceof AvroParseSpec) {
        if (((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
            return fullSchema;
        }
    }
    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
    List<DimensionSchema> dimensionSchema = parseSpec.getDimensionsSpec().getDimensions();
    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : dimensionSchema) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        // keep the timestamp column, any metric input fields, and declared dimensions;
        // when no dimensions are declared (auto-discovery), keep every field
        if (tsField.equals(type.getName())
            || metricsFields.contains(type.getName())
            || (dimensions.size() > 0 && dimensions.contains(type.getName()))
            || dimensions.size() == 0) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
Also used: ParseSpec (org.apache.druid.data.input.impl.ParseSpec), AvroParseSpec (org.apache.druid.data.input.avro.AvroParseSpec), ArrayList (java.util.ArrayList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), HashSet (java.util.HashSet)
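
The pruning step itself can be exercised in isolation against a Parquet schema, outside of an indexing job. Here is a small sketch using only the standard parquet-mr schema classes; the schema string and the retained column names are invented for illustration and do not come from the examples above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class SchemaPruningSketch {

    public static void main(String[] args) {
        // a toy file schema; in the read support above it comes from InitContext#getFileSchema()
        MessageType fullSchema = MessageTypeParser.parseMessageType(
            "message example {"
            + " required int64 timestamp;"
            + " optional binary dim1 (UTF8);"
            + " optional binary dim2 (UTF8);"
            + " optional int64 metric1;"
            + "}");

        // columns a hypothetical ingestion schema references (timestamp, one dimension, one metric input)
        Set<String> wanted = new HashSet<>(Arrays.asList("timestamp", "dim1", "metric1"));

        // keep only the referenced columns, mirroring getPartialReadSchema above
        List<Type> partialFields = new ArrayList<>();
        for (Type type : fullSchema.getFields()) {
            if (wanted.contains(type.getName())) {
                partialFields.add(type);
            }
        }
        MessageType partialSchema = new MessageType(fullSchema.getName(), partialFields);
        System.out.println(partialSchema);
    }
}

Returning such a partial schema as the requested read schema is what lets the reader skip columns the ingestion job never references, which is the point of getPartialReadSchema above.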

Example 29 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From class DruidParquetReadSupport, method getPartialReadSchema.

/**
 * Selects the columns from the Parquet file schema that are used by the schema of the ingestion job.
 *
 * @param context the context of the file to be read
 *
 * @return the partial schema containing only the columns used by the ingestion schema
 */
private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    // rebuild the indexer config that was serialized into the Hadoop configuration
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    ParseSpec parseSpec = config.getParser().getParseSpec();
    // if a flattenSpec is present, keep the full schema: flattened or auto-discovered
    // fields may reference columns that are not declared as dimensions
    if (parseSpec instanceof ParquetParseSpec) {
        if (((ParquetParseSpec) parseSpec).getFlattenSpec() != null) {
            return fullSchema;
        }
    }
    String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
    List<DimensionSchema> dimensionSchema = parseSpec.getDimensionsSpec().getDimensions();
    Set<String> dimensions = new HashSet<>();
    for (DimensionSchema dim : dimensionSchema) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = new HashSet<>();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    List<Type> partialFields = new ArrayList<>();
    for (Type type : fullSchema.getFields()) {
        // keep the timestamp column, any metric input fields, and declared dimensions;
        // when no dimensions are declared (auto-discovery), keep every field
        if (tsField.equals(type.getName())
            || metricsFields.contains(type.getName())
            || (dimensions.size() > 0 && dimensions.contains(type.getName()))
            || dimensions.size() == 0) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
Also used: ParseSpec (org.apache.druid.data.input.impl.ParseSpec), ArrayList (java.util.ArrayList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), HashSet (java.util.HashSet)

Example 30 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From class OrcHadoopInputRowParserTest, method testDate1900.

@Test
public void testDate1900() throws IOException {
    /*
      TestOrcFile.testDate1900.orc
      struct<time:timestamp,date:date>
      {1900-05-05 12:34:56.1, 1900-12-25}
     */
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/testDate1900_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(1, rows.get(0).getDimensions().size());
    Assert.assertEquals("1900-12-25T00:00:00.000Z", rows.get(0).getDimension("date").get(0));
    Assert.assertEquals(DateTimes.of("1900-05-05T12:34:56.1Z"), rows.get(0).getTimestamp());
}
Also used: OrcStruct (org.apache.orc.mapred.OrcStruct), Configuration (org.apache.hadoop.conf.Configuration), InputRow (org.apache.druid.data.input.InputRow), ArrayList (java.util.ArrayList), List (java.util.List), ImmutableList (com.google.common.collect.ImmutableList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)

Aggregations

HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 34
Test (org.junit.Test): 32
InputRow (org.apache.druid.data.input.InputRow): 27
ImmutableList (com.google.common.collect.ImmutableList): 19
List (java.util.List): 19
Job (org.apache.hadoop.mapreduce.Job): 9
ArrayList (java.util.ArrayList): 8
Configuration (org.apache.hadoop.conf.Configuration): 8
OrcStruct (org.apache.orc.mapred.OrcStruct): 6
BigDecimal (java.math.BigDecimal): 3
HadoopIngestionSpec (org.apache.druid.indexer.HadoopIngestionSpec): 3
IOException (java.io.IOException): 2
HashSet (java.util.HashSet): 2
DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema): 2
ParseSpec (org.apache.druid.data.input.impl.ParseSpec): 2
Bucket (org.apache.druid.indexer.Bucket): 2
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 2
DataSegment (org.apache.druid.timeline.DataSegment): 2
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 2
Path (org.apache.hadoop.fs.Path): 2