Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
From class BatchDeltaIngestionTest, method testReindexingWithNewAggregators.
/**
* By default, re-indexing expects the same aggregators as those used by the original indexing job. With the
* additional flag "useNewAggs" in DatasourcePathSpec, the user can optionally supply any set of aggregators.
* See https://github.com/apache/druid/issues/5277 .
*/
@Test
public void testReindexingWithNewAggregators() throws Exception {
  List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
  AggregatorFactory[] aggregators = new AggregatorFactory[]{
      new LongSumAggregatorFactory("visited_sum2", "visited_sum"),
      new HyperUniquesAggregatorFactory("unique_hosts2", "unique_hosts")
  };
  Map<String, Object> inputSpec = ImmutableMap.of(
      "type", "dataSource",
      "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
      "segments", segments,
      "useNewAggs", true
  );
  File tmpDir = temporaryFolder.newFolder();
  HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(inputSpec, tmpDir, aggregators);
  List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
      ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum2", 100L, "unique_hosts2", 1.0d),
      ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum2", 150L, "unique_hosts2", 1.0d),
      ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum2", 200L, "unique_hosts2", 1.0d)
  );
  testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"), ImmutableList.of("visited_sum2", "unique_hosts2"));
}
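For context on the class this page tracks: a WindowedDataSegment pairs an existing DataSegment with the slice of its interval that should be read back during re-indexing. Below is a minimal sketch, assuming the same SEGMENT constant as the test above; the WindowedDataSegment.of(...) factory and the partial interval value are recalled/illustrative rather than taken from the test, so verify them against the Druid version in use.

// Window the segment to its complete interval; equivalent to passing INTERVAL_FULL
// explicitly as the test above does (assumes the of(...) factory exists in this version).
WindowedDataSegment full = WindowedDataSegment.of(SEGMENT);

// Window the same segment to a narrower interval so that only rows inside the window
// are read back during re-indexing / delta ingestion. The interval below is a
// hypothetical example value, not one used by the test.
WindowedDataSegment partial = new WindowedDataSegment(
    SEGMENT,
    Intervals.of("2014-10-22T00:00:00Z/2014-10-22T02:00:00Z")
);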
Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
From class DatasourcePathSpec, method addInputPaths.
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
  if (segments == null || segments.isEmpty()) {
    if (ingestionSpec.isIgnoreWhenNoSegments()) {
      logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
      return job;
    } else {
      throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
    }
  }
  logger.info(
      "Found total [%d] segments for [%s] in interval [%s]",
      segments.size(),
      ingestionSpec.getDataSource(),
      ingestionSpec.getIntervals()
  );
  DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
  if (updatedIngestionSpec.getDimensions() == null) {
    List<String> dims;
    if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
      dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
    } else {
      Set<String> dimSet = Sets.newHashSet(Iterables.concat(Iterables.transform(
          segments,
          new Function<WindowedDataSegment, Iterable<String>>() {
            @Override
            public Iterable<String> apply(WindowedDataSegment dataSegment) {
              return dataSegment.getSegment().getDimensions();
            }
          }
      )));
      dims = Lists.newArrayList(Sets.difference(
          dimSet,
          config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()
      ));
    }
    updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
  }
  if (updatedIngestionSpec.getMetrics() == null) {
    Set<String> metrics = new HashSet<>();
    final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
    if (cols != null) {
      if (useNewAggs) {
        for (AggregatorFactory col : cols) {
          metrics.addAll(col.requiredFields());
        }
      } else {
        for (AggregatorFactory col : cols) {
          metrics.add(col.getName());
        }
      }
    }
    updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
  }
  updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());
  // propagate in the transformSpec from the overall job config
  updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());
  DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
  MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
  return job;
}
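The useNewAggs branch above is what makes re-indexing with renamed aggregators possible: by default the path spec reads back the old segment's metric columns by aggregator output name, whereas with useNewAggs it reads the input fields the new aggregators require. A minimal sketch of the difference, using the aggregator from BatchDeltaIngestionTest above; the local variable names are illustrative only.

// Aggregator from the test: writes "visited_sum2" by summing the existing "visited_sum" column.
AggregatorFactory agg = new LongSumAggregatorFactory("visited_sum2", "visited_sum");

// Default path (useNewAggs == false): metrics are selected by output name. A renamed
// aggregator's output column does not exist in the old segment, hence the flag.
String byName = agg.getName();                // "visited_sum2"

// useNewAggs path: metrics are the input fields the new aggregator requires,
// which do exist in the old segment.
List<String> byInput = agg.requiredFields();  // ["visited_sum"]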
Use of org.apache.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
From class BatchDeltaIngestionTest, method testReindexing.
@Test
public void testReindexing() throws Exception {
  List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
  HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
      ImmutableMap.of(
          "type", "dataSource",
          "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
          "segments", segments
      ),
      temporaryFolder.newFolder()
  );
  List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
      ImmutableMap.of("time", DateTimes.of("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
      ImmutableMap.of("time", DateTimes.of("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d),
      ImmutableMap.of("time", DateTimes.of("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum", 200L, "unique_hosts", 1.0d)
  );
  testIngestion(config, expectedRows, Iterables.getOnlyElement(segments), ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"));
}
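Compared with testReindexingWithNewAggregators above, this test reuses the original aggregators, so its inputSpec map simply omits the "useNewAggs" entry. A side-by-side recap of the two maps as they appear in the tests (no new keys are introduced here; variable names are illustrative):

// Plain re-indexing: original aggregators are reused, no extra flag required.
Map<String, Object> reindexSpec = ImmutableMap.of(
    "type", "dataSource",
    "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
    "segments", segments
);

// Re-indexing with a different aggregator set: opt in via "useNewAggs".
Map<String, Object> reindexWithNewAggs = ImmutableMap.of(
    "type", "dataSource",
    "ingestionSpec", ImmutableMap.of("dataSource", "testds", "interval", INTERVAL_FULL),
    "segments", segments,
    "useNewAggs", true
);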