
Example 6 with WindowedDataSegment

Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.

From class BatchDeltaIngestionTest, method testReindexingWithNewAggregators:

/**
 * By default, re-indexing expects the same aggregators as those used by the original indexing job. With the
 * additional flag "useNewAggs" in DatasourcePathSpec, the user can optionally supply any set of aggregators.
 * See https://github.com/apache/druid/issues/5277.
 */
@Test
public void testReindexingWithNewAggregators() throws Exception {
    List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
    AggregatorFactory[] aggregators = new AggregatorFactory[] {
        new LongSumAggregatorFactory("visited_sum2", "visited_sum"),
        new HyperUniquesAggregatorFactory("unique_hosts2", "unique_hosts")
    };
    Map<String, Object> inputSpec = ImmutableMap.of(
        "type", "dataSource",
        "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
        "segments", segments,
        "useNewAggs", true
    );
    File tmpDir = temporaryFolder.newFolder();
    HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(inputSpec, tmpDir, aggregators);
    List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
        ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum2", 100L, "unique_hosts2", 1.0d),
        ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum2", 150L, "unique_hosts2", 1.0d),
        ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum2", 200L, "unique_hosts2", 1.0d)
    );
    testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"), ImmutableList.of("visited_sum2", "unique_hosts2"));
}
Also used: LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory), HyperUniquesAggregatorFactory (org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), ImmutableMap (com.google.common.collect.ImmutableMap), WindowedDataSegment (org.apache.druid.indexer.hadoop.WindowedDataSegment), File (java.io.File), Test (org.junit.Test)
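
To make the "useNewAggs" flag concrete, here is a minimal sketch of the difference it makes, using only the AggregatorFactory API already imported above. The variable names are illustrative, and the aggregator mirrors the one in the test:

// Sketch: what "useNewAggs" changes about which columns are read
// (assumption: standard Druid AggregatorFactory API; names mirror the test above).
AggregatorFactory agg = new LongSumAggregatorFactory("visited_sum2", "visited_sum");
// Default re-indexing: the job reads the aggregator's own output column, so the
// original indexing job must already have produced "visited_sum2".
String outputColumn = agg.getName();               // "visited_sum2"
// With "useNewAggs" = true: the job reads the aggregator's input fields instead,
// here the pre-existing "visited_sum" column, and aggregates it into "visited_sum2".
List<String> inputColumns = agg.requiredFields();  // ["visited_sum"]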

Example 7 with WindowedDataSegment

Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.

From class DatasourcePathSpec, method addInputPaths:

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    if (segments == null || segments.isEmpty()) {
        if (ingestionSpec.isIgnoreWhenNoSegments()) {
            logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
            return job;
        } else {
            throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
        }
    }
    logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(), ingestionSpec.getDataSource(), ingestionSpec.getIntervals());
    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
    if (updatedIngestionSpec.getDimensions() == null) {
        List<String> dims;
        if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
            // Dimensions are explicitly configured; use them as-is.
            dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
        } else {
            // Otherwise derive them from the input segments: union the dimensions
            // of every segment, then subtract the parse spec's exclusions.
            Set<String> dimSet = Sets.newHashSet(Iterables.concat(Iterables.transform(
                segments,
                dataSegment -> dataSegment.getSegment().getDimensions()
            )));
            dims = Lists.newArrayList(Sets.difference(dimSet, config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
        }
        updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }
    }
    if (updatedIngestionSpec.getMetrics() == null) {
        Set<String> metrics = new HashSet<>();
        final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
        if (cols != null) {
            if (useNewAggs) {
                // New aggregators may differ from the originals, so read their input fields.
                for (AggregatorFactory col : cols) {
                    metrics.addAll(col.requiredFields());
                }
            } else {
                // Default re-indexing reads the previously computed metric columns directly.
                for (AggregatorFactory col : cols) {
                    metrics.add(col.getName());
                }
            }
        }
        updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }
    }
    updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());
    // propagate in the transformSpec from the overall job config
    updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());
    DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
    // The path is a placeholder; DatasourceInputFormat generates the actual splits from the segments.
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
    return job;
}
Also used: DatasourceIngestionSpec (org.apache.druid.indexer.hadoop.DatasourceIngestionSpec), Path (org.apache.hadoop.fs.Path), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), WindowedDataSegment (org.apache.druid.indexer.hadoop.WindowedDataSegment), ISE (org.apache.druid.java.util.common.ISE), HashSet (java.util.HashSet)
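
The dimension-derivation branch above unions the dimensions of all input segments and subtracts the parse spec's exclusions. A minimal standalone sketch of that set arithmetic, using Guava only (the per-segment dimension sets here are illustrative, not taken from real segments):

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import java.util.Set;

public class DimensionDerivationSketch {
    public static void main(String[] args) {
        // Union of the dimensions found across all input segments.
        Set<String> dimSet = Sets.union(
            ImmutableSet.of("host", "country"),
            ImmutableSet.of("host", "page")
        );
        // Exclusions configured on the parse spec's DimensionsSpec.
        Set<String> exclusions = ImmutableSet.of("country");
        // Same arithmetic as addInputPaths: derived dims = union minus exclusions.
        Set<String> dims = Sets.difference(dimSet, exclusions);
        System.out.println(dims); // [host, page]
    }
}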

Example 8 with WindowedDataSegment

Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.

From class BatchDeltaIngestionTest, method testReindexing:

@Test
public void testReindexing() throws Exception {
    List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
    HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
        ImmutableMap.of(
            "type", "dataSource",
            "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
            "segments", segments
        ),
        temporaryFolder.newFolder()
    );
    List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
        ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
        ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d),
        ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum", 200L, "unique_hosts", 1.0d)
    );
    testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"));
}
Also used: WindowedDataSegment (org.apache.druid.indexer.hadoop.WindowedDataSegment), ImmutableMap (com.google.common.collect.ImmutableMap), Test (org.junit.Test)
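
Both reindexing tests window the segment explicitly over INTERVAL_FULL. For completeness, a short sketch of the two ways to build the wrapper. It assumes WindowedDataSegment.of(...) exists as a convenience factory covering the segment's own full interval, as used elsewhere in Druid's tests; SEGMENT and INTERVAL_FULL are the test fixtures above:

// Explicit window: read only rows of SEGMENT whose timestamps fall inside INTERVAL_FULL.
WindowedDataSegment explicitWindow = new WindowedDataSegment(SEGMENT, INTERVAL_FULL);
// Convenience factory (assumed): windows the segment over its own full interval.
WindowedDataSegment fullWindow = WindowedDataSegment.of(SEGMENT);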

Aggregations

WindowedDataSegment (org.apache.druid.indexer.hadoop.WindowedDataSegment): 8 usages
Test (org.junit.Test): 6 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 4 usages
DatasourceIngestionSpec (org.apache.druid.indexer.hadoop.DatasourceIngestionSpec): 4 usages
File (java.io.File): 2 usages
HashSet (java.util.HashSet): 2 usages
DatasourcePathSpec (org.apache.druid.indexer.path.DatasourcePathSpec): 2 usages
MultiplePathSpec (org.apache.druid.indexer.path.MultiplePathSpec): 2 usages
PathSpec (org.apache.druid.indexer.path.PathSpec): 2 usages
StaticPathSpec (org.apache.druid.indexer.path.StaticPathSpec): 2 usages
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 2 usages
IOException (java.io.IOException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
List (java.util.List): 1 usage
Map (java.util.Map): 1 usage
ISE (org.apache.druid.java.util.common.ISE): 1 usage
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 1 usage
HyperUniquesAggregatorFactory (org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory): 1 usage
DataSegment (org.apache.druid.timeline.DataSegment): 1 usage
TimelineObjectHolder (org.apache.druid.timeline.TimelineObjectHolder): 1 usage