use of org.apache.druid.query.groupby.GroupByQuery in project druid by druid-io.
the class SketchAggregationTest method testThetaCardinalityOnSimpleColumn.
@Test
public void testThetaCardinalityOnSimpleColumn() throws Exception {
final GroupByQuery groupByQuery = readQueryFromClasspath("simple_test_data_group_by_query.json", helper.getObjectMapper(), vectorize);
final Sequence<ResultRow> seq = helper.createIndexAndRunQueryOnSegment(new File(SketchAggregationTest.class.getClassLoader().getResource("simple_test_data.tsv").getFile()), readFileFromClasspathAsString("simple_test_data_record_parser2.json"), "[" + " {" + " \"type\": \"count\"," + " \"name\": \"count\"" + " }" + "]", 0, Granularities.NONE, 1000, groupByQuery);
List<ResultRow> results = seq.toList();
Assert.assertEquals(5, results.size());
Assert.assertEquals(ImmutableList.of(new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("product", "product_3").put("sketch_count", 38.0).put("sketchEstimatePostAgg", 38.0).put("sketchUnionPostAggEstimate", 38.0).put("sketchIntersectionPostAggEstimate", 38.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build()), new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("product", "product_1").put("sketch_count", 42.0).put("sketchEstimatePostAgg", 42.0).put("sketchUnionPostAggEstimate", 42.0).put("sketchIntersectionPostAggEstimate", 42.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build()), new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("product", "product_2").put("sketch_count", 42.0).put("sketchEstimatePostAgg", 42.0).put("sketchUnionPostAggEstimate", 42.0).put("sketchIntersectionPostAggEstimate", 42.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build()), new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("product", "product_4").put("sketch_count", 42.0).put("sketchEstimatePostAgg", 42.0).put("sketchUnionPostAggEstimate", 42.0).put("sketchIntersectionPostAggEstimate", 42.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build()), new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("product", "product_5").put("sketch_count", 42.0).put("sketchEstimatePostAgg", 42.0).put("sketchUnionPostAggEstimate", 42.0).put("sketchIntersectionPostAggEstimate", 42.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build())).stream().map(row -> ResultRow.fromLegacyRow(row, groupByQuery)).collect(Collectors.toList()), results);
}
use of org.apache.druid.query.groupby.GroupByQuery in project druid by druid-io.
the class OldApiSketchAggregationTest method testSimpleDataIngestAndQuery.
@Test
public void testSimpleDataIngestAndQuery() throws Exception {
final String groupByQueryString = readFileFromClasspathAsString("oldapi/old_simple_test_data_group_by_query.json");
final GroupByQuery groupByQuery = (GroupByQuery) helper.getObjectMapper().readValue(groupByQueryString, Query.class);
final Sequence seq = helper.createIndexAndRunQueryOnSegment(new File(this.getClass().getClassLoader().getResource("simple_test_data.tsv").getFile()), readFileFromClasspathAsString("simple_test_data_record_parser.json"), readFileFromClasspathAsString("oldapi/old_simple_test_data_aggregators.json"), 0, Granularities.NONE, 1000, groupByQueryString);
List results = seq.toList();
Assert.assertEquals(1, results.size());
Assert.assertEquals(ResultRow.fromLegacyRow(new MapBasedRow(DateTimes.of("2014-10-19T00:00:00.000Z"), ImmutableMap.<String, Object>builder().put("sketch_count", 50.0).put("sketchEstimatePostAgg", 50.0).put("sketchUnionPostAggEstimate", 50.0).put("sketchIntersectionPostAggEstimate", 50.0).put("sketchAnotBPostAggEstimate", 0.0).put("non_existing_col_validation", 0.0).build()), groupByQuery), results.get(0));
}
use of org.apache.druid.query.groupby.GroupByQuery in project druid by druid-io.
the class GroupByStrategyV2 method mergeResults.
@Override
public Sequence<ResultRow> mergeResults(final QueryRunner<ResultRow> baseRunner, final GroupByQuery query, final ResponseContext responseContext) {
// Merge streams using ResultMergeQueryRunner, then apply postaggregators, then apply limit (which may
// involve materialization)
final ResultMergeQueryRunner<ResultRow> mergingQueryRunner = new ResultMergeQueryRunner<>(baseRunner, this::createResultComparator, this::createMergeFn);
// Set up downstream context.
final ImmutableMap.Builder<String, Object> context = ImmutableMap.builder();
context.put("finalize", false);
context.put(GroupByQueryConfig.CTX_KEY_STRATEGY, GroupByStrategySelector.STRATEGY_V2);
context.put(CTX_KEY_OUTERMOST, false);
Granularity granularity = query.getGranularity();
List<DimensionSpec> dimensionSpecs = query.getDimensions();
// the CTX_TIMESTAMP_RESULT_FIELD is set in DruidQuery.java
final String timestampResultField = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
final boolean hasTimestampResultField = (timestampResultField != null && !timestampResultField.isEmpty()) && query.getContextBoolean(CTX_KEY_OUTERMOST, true) && !query.isApplyLimitPushDown();
int timestampResultFieldIndex = 0;
if (hasTimestampResultField) {
// sql like "group by city_id,time_floor(__time to day)",
// the original translated query is granularity=all and dimensions:[d0, d1]
// the better plan is granularity=day and dimensions:[d0]
// but the ResultRow structure is changed from [d0, d1] to [__time, d0]
// this structure should be fixed as [d0, d1] (actually it is [d0, __time]) before postAggs are called.
//
// the above is the general idea of this optimization.
// but from coding perspective, the granularity=all and "d0" dimension are referenced by many places,
// eg: subtotals, having, grouping set, post agg,
// there would be many many places need to be fixed if "d0" dimension is removed from query.dimensions
// and the same to the granularity change.
// so from easier coding perspective, this optimization is coded as groupby engine-level inner process change.
// the most part of codes are in GroupByStrategyV2 about the process change between broker and compute node.
// the basic logic like nested queries and subtotals are kept unchanged,
// they will still see the granularity=all and the "d0" dimension.
//
// the tradeoff is that GroupByStrategyV2 behaviors differently according to the query contexts set in DruidQuery
// in another word,
// the query generated by "explain plan for select ..." doesn't match to the native query ACTUALLY being executed,
// the granularity and dimensions are slightly different.
// now, part of the query plan logic is handled in GroupByStrategyV2, not only in DruidQuery.toGroupByQuery()
final Granularity timestampResultFieldGranularity = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_GRANULARITY);
dimensionSpecs = query.getDimensions().stream().filter(dimensionSpec -> !dimensionSpec.getOutputName().equals(timestampResultField)).collect(Collectors.toList());
granularity = timestampResultFieldGranularity;
// when timestampResultField is the last dimension, should set sortByDimsFirst=true,
// otherwise the downstream is sorted by row's timestamp first which makes the final ordering not as expected
timestampResultFieldIndex = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX);
if (!query.getContextSortByDimsFirst() && timestampResultFieldIndex == query.getDimensions().size() - 1) {
context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, true);
}
// it is actually equals to sortByDimsFirst=false
if (query.getContextSortByDimsFirst() && timestampResultFieldIndex == 0) {
context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, false);
}
// when hasTimestampResultField=true and timestampResultField is neither first nor last dimension,
// the DefaultLimitSpec will always do the reordering
}
final int timestampResultFieldIndexInOriginalDimensions = timestampResultFieldIndex;
if (query.getUniversalTimestamp() != null && !hasTimestampResultField) {
// universalTimestamp works only when granularity is all
// hasTimestampResultField works only when granularity is all
// fudgeTimestamp should not be used when hasTimestampResultField=true due to the row's actual timestamp is used
context.put(CTX_KEY_FUDGE_TIMESTAMP, String.valueOf(query.getUniversalTimestamp().getMillis()));
}
// The having spec shouldn't be passed down, so we need to convey the existing limit push down status
context.put(GroupByQueryConfig.CTX_KEY_APPLY_LIMIT_PUSH_DOWN, query.isApplyLimitPushDown());
// Always request array result rows when passing the query downstream.
context.put(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true);
final GroupByQuery newQuery = new GroupByQuery(query.getDataSource(), query.getQuerySegmentSpec(), query.getVirtualColumns(), query.getDimFilter(), granularity, dimensionSpecs, query.getAggregatorSpecs(), // Don't apply postaggregators on compute nodes
ImmutableList.of(), // Don't do "having" clause until the end of this method.
null, // higher-up).
query.isApplyLimitPushDown() ? ((DefaultLimitSpec) query.getLimitSpec()).withOffsetToLimit() : null, query.getSubtotalsSpec(), query.getContext()).withOverriddenContext(context.build());
final Sequence<ResultRow> mergedResults = mergingQueryRunner.run(QueryPlus.wrap(newQuery), responseContext);
if (!query.getContextBoolean(CTX_KEY_OUTERMOST, true) || query.getContextBoolean(GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, false)) {
return mergedResults;
} else if (query.getPostAggregatorSpecs().isEmpty()) {
if (!hasTimestampResultField) {
return mergedResults;
}
return Sequences.map(mergedResults, row -> {
final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithoutPostAggregators());
moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, resultRow);
return resultRow;
});
} else {
return Sequences.map(mergedResults, row -> {
// This function's purpose is to apply PostAggregators.
final ResultRow rowWithPostAggregations = ResultRow.create(query.getResultRowSizeWithPostAggregators());
// Copy everything that comes before the postaggregations.
if (hasTimestampResultField) {
moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, rowWithPostAggregations);
} else {
for (int i = 0; i < query.getResultRowPostAggregatorStart(); i++) {
rowWithPostAggregations.set(i, row.get(i));
}
}
// Compute postaggregations. We need to do this with a result-row map because PostAggregator.compute
// expects a map. Some further design adjustment may eliminate the need for it, and speed up this function.
final Map<String, Object> mapForPostAggregationComputation = rowWithPostAggregations.toMap(query);
for (int i = 0; i < query.getPostAggregatorSpecs().size(); i++) {
final PostAggregator postAggregator = query.getPostAggregatorSpecs().get(i);
final Object value = postAggregator.compute(mapForPostAggregationComputation);
rowWithPostAggregations.set(query.getResultRowPostAggregatorStart() + i, value);
mapForPostAggregationComputation.put(postAggregator.getName(), value);
}
return rowWithPostAggregations;
});
}
}
use of org.apache.druid.query.groupby.GroupByQuery in project druid by druid-io.
the class GroupByStrategyV2 method processSubtotalsSpec.
@Override
public Sequence<ResultRow> processSubtotalsSpec(GroupByQuery query, GroupByQueryResource resource, Sequence<ResultRow> queryResult) {
// How it works?
// First we accumulate the result of top level base query aka queryResult arg inside a resultSupplierOne object.
// Next for each subtotalSpec
// If subtotalSpec is a prefix of top level dims then we iterate on rows in resultSupplierOne object which are still
// sorted by subtotalSpec, stream merge them and return.
//
// If subtotalSpec is not a prefix of top level dims then we create a resultSupplierTwo object filled with rows from
// resultSupplierOne object with only dims from subtotalSpec. Then we iterate on rows in resultSupplierTwo object which are
// of course sorted by subtotalSpec, stream merge them and return.
// Keep a reference to resultSupplier outside the "try" so we can close it if something goes wrong
// while creating the sequence.
GroupByRowProcessor.ResultSupplier resultSupplierOne = null;
try {
// baseSubtotalQuery is the original query with dimensions and aggregators rewritten to apply to the *results*
// rather than *inputs* of that query. It has its virtual columns and dim filter removed, because those only
// make sense when applied to inputs. Finally, it has subtotalsSpec removed, since we'll be computing them
// one-by-one soon enough.
GroupByQuery baseSubtotalQuery = query.withDimensionSpecs(query.getDimensions().stream().map(dimSpec -> new DefaultDimensionSpec(dimSpec.getOutputName(), dimSpec.getOutputName(), dimSpec.getOutputType())).collect(Collectors.toList())).withAggregatorSpecs(query.getAggregatorSpecs().stream().map(AggregatorFactory::getCombiningFactory).collect(Collectors.toList())).withVirtualColumns(VirtualColumns.EMPTY).withDimFilter(null).withSubtotalsSpec(null).withOverriddenContext(ImmutableMap.of(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD, ""));
resultSupplierOne = GroupByRowProcessor.process(baseSubtotalQuery, baseSubtotalQuery, queryResult, configSupplier.get(), resource, spillMapper, processingConfig.getTmpDir(), processingConfig.intermediateComputeSizeBytes());
List<String> queryDimNames = baseSubtotalQuery.getDimensions().stream().map(DimensionSpec::getOutputName).collect(Collectors.toList());
// Only needed to make LimitSpec.filterColumns(..) call later in case base query has a non default LimitSpec.
Set<String> aggsAndPostAggs = null;
if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
aggsAndPostAggs = getAggregatorAndPostAggregatorNames(baseSubtotalQuery);
}
List<List<String>> subtotals = query.getSubtotalsSpec();
List<Sequence<ResultRow>> subtotalsResults = new ArrayList<>(subtotals.size());
// Iterate through each subtotalSpec, build results for it and add to subtotalsResults
for (List<String> subtotalSpec : subtotals) {
final ImmutableSet<String> dimsInSubtotalSpec = ImmutableSet.copyOf(subtotalSpec);
// Dimension spec including dimension name and output name
final List<DimensionSpec> subTotalDimensionSpec = new ArrayList<>(dimsInSubtotalSpec.size());
final List<DimensionSpec> dimensions = query.getDimensions();
for (DimensionSpec dimensionSpec : dimensions) {
if (dimsInSubtotalSpec.contains(dimensionSpec.getOutputName())) {
subTotalDimensionSpec.add(dimensionSpec);
}
}
// Create appropriate LimitSpec for subtotal query
LimitSpec subtotalQueryLimitSpec = NoopLimitSpec.instance();
if (!(baseSubtotalQuery.getLimitSpec() instanceof NoopLimitSpec)) {
Set<String> columns = new HashSet<>(aggsAndPostAggs);
columns.addAll(subtotalSpec);
subtotalQueryLimitSpec = baseSubtotalQuery.getLimitSpec().filterColumns(columns);
}
GroupByQuery subtotalQuery = baseSubtotalQuery.withLimitSpec(subtotalQueryLimitSpec);
final GroupByRowProcessor.ResultSupplier resultSupplierOneFinal = resultSupplierOne;
if (Utils.isPrefix(subtotalSpec, queryDimNames)) {
// Since subtotalSpec is a prefix of base query dimensions, so results from base query are also sorted
// by subtotalSpec as needed by stream merging.
subtotalsResults.add(processSubtotalsResultAndOptionallyClose(() -> resultSupplierOneFinal, subTotalDimensionSpec, subtotalQuery, false));
} else {
// Since subtotalSpec is not a prefix of base query dimensions, so results from base query are not sorted
// by subtotalSpec. So we first add the result of base query into another resultSupplier which are sorted
// by subtotalSpec and then stream merge them.
// Also note, we can't create the ResultSupplier eagerly here or as we don't want to eagerly allocate
// merge buffers for processing subtotal.
Supplier<GroupByRowProcessor.ResultSupplier> resultSupplierTwo = () -> GroupByRowProcessor.process(baseSubtotalQuery, subtotalQuery, resultSupplierOneFinal.results(subTotalDimensionSpec), configSupplier.get(), resource, spillMapper, processingConfig.getTmpDir(), processingConfig.intermediateComputeSizeBytes());
subtotalsResults.add(processSubtotalsResultAndOptionallyClose(resultSupplierTwo, subTotalDimensionSpec, subtotalQuery, true));
}
}
return Sequences.withBaggage(query.postProcess(Sequences.concat(subtotalsResults)), // this will close resources allocated by resultSupplierOne after sequence read
resultSupplierOne);
} catch (Throwable e) {
throw CloseableUtils.closeAndWrapInCatch(e, resultSupplierOne);
}
}
use of org.apache.druid.query.groupby.GroupByQuery in project druid by druid-io.
the class GroupByStrategyV2 method numMergeBuffersNeededForSubtotalsSpec.
private static int numMergeBuffersNeededForSubtotalsSpec(GroupByQuery query) {
List<List<String>> subtotalSpecs = query.getSubtotalsSpec();
final DataSource dataSource = query.getDataSource();
int numMergeBuffersNeededForSubQuerySubtotal = 0;
if (dataSource instanceof QueryDataSource) {
Query<?> subQuery = ((QueryDataSource) dataSource).getQuery();
if (subQuery instanceof GroupByQuery) {
numMergeBuffersNeededForSubQuerySubtotal = numMergeBuffersNeededForSubtotalsSpec((GroupByQuery) subQuery);
}
}
if (subtotalSpecs == null || subtotalSpecs.size() == 0) {
return numMergeBuffersNeededForSubQuerySubtotal;
}
List<String> queryDimOutputNames = query.getDimensions().stream().map(DimensionSpec::getOutputName).collect(Collectors.toList());
for (List<String> subtotalSpec : subtotalSpecs) {
if (!Utils.isPrefix(subtotalSpec, queryDimOutputNames)) {
return 2;
}
}
return Math.max(1, numMergeBuffersNeededForSubQuerySubtotal);
}
Aggregations