use of org.apache.druid.query.groupby.orderby.DefaultLimitSpec in project druid by druid-io.
the class GroupByStrategyV2 method mergeResults.
@Override
public Sequence<ResultRow> mergeResults(final QueryRunner<ResultRow> baseRunner, final GroupByQuery query, final ResponseContext responseContext) {
// Merge streams using ResultMergeQueryRunner, then apply postaggregators, then apply limit (which may
// involve materialization)
final ResultMergeQueryRunner<ResultRow> mergingQueryRunner = new ResultMergeQueryRunner<>(baseRunner, this::createResultComparator, this::createMergeFn);
// Set up downstream context.
final ImmutableMap.Builder<String, Object> context = ImmutableMap.builder();
context.put("finalize", false);
context.put(GroupByQueryConfig.CTX_KEY_STRATEGY, GroupByStrategySelector.STRATEGY_V2);
context.put(CTX_KEY_OUTERMOST, false);
Granularity granularity = query.getGranularity();
List<DimensionSpec> dimensionSpecs = query.getDimensions();
// the CTX_TIMESTAMP_RESULT_FIELD is set in DruidQuery.java
final String timestampResultField = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
final boolean hasTimestampResultField = (timestampResultField != null && !timestampResultField.isEmpty()) && query.getContextBoolean(CTX_KEY_OUTERMOST, true) && !query.isApplyLimitPushDown();
int timestampResultFieldIndex = 0;
if (hasTimestampResultField) {
// sql like "group by city_id, time_floor(__time to day)":
// the original translated query has granularity=all and dimensions:[d0, d1];
// the better plan is granularity=day and dimensions:[d0],
// but then the ResultRow structure changes from [d0, d1] to [__time, d0],
// and this structure should be fixed back to [d0, d1] (actually [d0, __time]) before postAggs are called.
//
// the above is the general idea of this optimization.
// from a coding perspective, though, granularity=all and the "d0" dimension are referenced in many places,
// e.g. subtotals, having, grouping sets, post aggs;
// many places would need to be fixed if the "d0" dimension were removed from query.dimensions,
// and the same goes for the granularity change.
// so, to keep the change simple, this optimization is implemented as an internal process change in the groupBy engine.
// most of the code lives in GroupByStrategyV2 and concerns the process change between the broker and the compute nodes.
// basic logic such as nested queries and subtotals is kept unchanged;
// it will still see granularity=all and the "d0" dimension.
//
// the tradeoff is that GroupByStrategyV2 behaves differently depending on the query context set in DruidQuery.
// in other words,
// the query generated by "explain plan for select ..." doesn't match the native query ACTUALLY being executed;
// the granularity and dimensions are slightly different.
// now, part of the query planning logic is handled in GroupByStrategyV2, not only in DruidQuery.toGroupByQuery().
final Granularity timestampResultFieldGranularity = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_GRANULARITY);
dimensionSpecs = query.getDimensions().stream().filter(dimensionSpec -> !dimensionSpec.getOutputName().equals(timestampResultField)).collect(Collectors.toList());
granularity = timestampResultFieldGranularity;
// when timestampResultField is the last dimension, sortByDimsFirst should be set to true;
// otherwise the downstream results are sorted by the row's timestamp first, which makes the final ordering not as expected
timestampResultFieldIndex = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX);
if (!query.getContextSortByDimsFirst() && timestampResultFieldIndex == query.getDimensions().size() - 1) {
context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, true);
}
// when timestampResultField is the first dimension and sortByDimsFirst=true,
// this is effectively equivalent to sortByDimsFirst=false
if (query.getContextSortByDimsFirst() && timestampResultFieldIndex == 0) {
context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, false);
}
// when hasTimestampResultField=true and timestampResultField is neither first nor last dimension,
// the DefaultLimitSpec will always do the reordering
}
final int timestampResultFieldIndexInOriginalDimensions = timestampResultFieldIndex;
if (query.getUniversalTimestamp() != null && !hasTimestampResultField) {
// universalTimestamp works only when granularity is all
// hasTimestampResultField works only when granularity is all
// fudgeTimestamp should not be used when hasTimestampResultField=true because the row's actual timestamp is used
context.put(CTX_KEY_FUDGE_TIMESTAMP, String.valueOf(query.getUniversalTimestamp().getMillis()));
}
// The having spec shouldn't be passed down, so we need to convey the existing limit push down status
context.put(GroupByQueryConfig.CTX_KEY_APPLY_LIMIT_PUSH_DOWN, query.isApplyLimitPushDown());
// Always request array result rows when passing the query downstream.
context.put(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true);
final GroupByQuery newQuery = new GroupByQuery(
    query.getDataSource(),
    query.getQuerySegmentSpec(),
    query.getVirtualColumns(),
    query.getDimFilter(),
    granularity,
    dimensionSpecs,
    query.getAggregatorSpecs(),
    // Don't apply postaggregators on compute nodes
    ImmutableList.of(),
    // Don't do the "having" clause until the end of this method.
    null,
    // Forward the limit spec only when limit push down applies; otherwise pass null and apply the
    // DefaultLimitSpec after merging (i.e. higher-up).
    query.isApplyLimitPushDown() ? ((DefaultLimitSpec) query.getLimitSpec()).withOffsetToLimit() : null,
    query.getSubtotalsSpec(),
    query.getContext()
).withOverriddenContext(context.build());
final Sequence<ResultRow> mergedResults = mergingQueryRunner.run(QueryPlus.wrap(newQuery), responseContext);
if (!query.getContextBoolean(CTX_KEY_OUTERMOST, true) || query.getContextBoolean(GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, false)) {
return mergedResults;
} else if (query.getPostAggregatorSpecs().isEmpty()) {
if (!hasTimestampResultField) {
return mergedResults;
}
return Sequences.map(mergedResults, row -> {
final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithoutPostAggregators());
moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, resultRow);
return resultRow;
});
} else {
return Sequences.map(mergedResults, row -> {
// This function's purpose is to apply PostAggregators.
final ResultRow rowWithPostAggregations = ResultRow.create(query.getResultRowSizeWithPostAggregators());
// Copy everything that comes before the postaggregations.
if (hasTimestampResultField) {
moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, rowWithPostAggregations);
} else {
for (int i = 0; i < query.getResultRowPostAggregatorStart(); i++) {
rowWithPostAggregations.set(i, row.get(i));
}
}
// Compute postaggregations. We need to do this with a result-row map because PostAggregator.compute
// expects a map. Some further design adjustment may eliminate the need for it, and speed up this function.
final Map<String, Object> mapForPostAggregationComputation = rowWithPostAggregations.toMap(query);
for (int i = 0; i < query.getPostAggregatorSpecs().size(); i++) {
final PostAggregator postAggregator = query.getPostAggregatorSpecs().get(i);
final Object value = postAggregator.compute(mapForPostAggregationComputation);
rowWithPostAggregations.set(query.getResultRowPostAggregatorStart() + i, value);
mapForPostAggregationComputation.put(postAggregator.getName(), value);
}
return rowWithPostAggregations;
});
}
}
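For reference, here is a minimal sketch (not part of the Druid source above) of how a DefaultLimitSpec typically reaches this code path: the GroupByQuery carries an ordering plus a limit, and forcing limit push down via the query context is what makes the rewritten downstream query built above keep the limit spec. The wrapper class, the "sales" datasource, the column names, and the interval are hypothetical; the Druid classes and the context key are the ones that appear in the snippets on this page.

import java.util.Collections;
import com.google.common.collect.ImmutableMap;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;
import org.apache.druid.query.dimension.DefaultDimensionSpec;
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.orderby.DefaultLimitSpec;
import org.apache.druid.query.groupby.orderby.OrderByColumnSpec;
import org.apache.druid.query.ordering.StringComparators;
import org.apache.druid.query.spec.MultipleIntervalSegmentSpec;

public class DefaultLimitSpecUsageSketch
{
  public static GroupByQuery buildTopCountriesQuery()
  {
    // "Top 10 countries by revenue" over January 2021; forceLimitPushDown asks the broker to hand the
    // DefaultLimitSpec down to the compute nodes, so mergeResults() above forwards
    // ((DefaultLimitSpec) query.getLimitSpec()).withOffsetToLimit() instead of null.
    return GroupByQuery.builder()
        .setDataSource("sales")
        .setQuerySegmentSpec(new MultipleIntervalSegmentSpec(
            Collections.singletonList(Intervals.of("2021-01-01/2021-02-01"))))
        .setDimensions(new DefaultDimensionSpec("country", "country"))
        .setAggregatorSpecs(new LongSumAggregatorFactory("revenue", "revenue"))
        .setLimitSpec(new DefaultLimitSpec(
            Collections.singletonList(
                new OrderByColumnSpec("revenue", OrderByColumnSpec.Direction.DESCENDING, StringComparators.NUMERIC)),
            10))
        .setGranularity(Granularities.ALL)
        .setContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_FORCE_LIMIT_PUSH_DOWN, true))
        .build();
  }
}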
use of org.apache.druid.query.groupby.orderby.DefaultLimitSpec in project druid by druid-io.
the class GroupByQueryRunnerTest method testMergeResultsWithOrderBy.
@Test
public void testMergeResultsWithOrderBy() {
LimitSpec[] orderBySpecs = new LimitSpec[] {
    new DefaultLimitSpec(OrderByColumnSpec.ascending("idx"), null),
    new DefaultLimitSpec(OrderByColumnSpec.ascending("rows", "idx"), null),
    new DefaultLimitSpec(OrderByColumnSpec.descending("idx"), null),
    new DefaultLimitSpec(OrderByColumnSpec.descending("rows", "idx"), null)
};
GroupByQuery baseQuery = makeQueryBuilder()
    .setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
    .setInterval("2011-04-02/2011-04-04")
    .setDimensions(new DefaultDimensionSpec("quality", "alias"))
    .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
    .setGranularity(new PeriodGranularity(new Period("P1M"), null, null))
    .build();
List<ResultRow> allResults = Arrays.asList(
    makeRow(baseQuery, "2011-04-01", "alias", "automotive", "rows", 2L, "idx", 269L),
    makeRow(baseQuery, "2011-04-01", "alias", "business", "rows", 2L, "idx", 217L),
    makeRow(baseQuery, "2011-04-01", "alias", "entertainment", "rows", 2L, "idx", 319L),
    makeRow(baseQuery, "2011-04-01", "alias", "health", "rows", 2L, "idx", 216L),
    makeRow(baseQuery, "2011-04-01", "alias", "mezzanine", "rows", 6L, "idx", 4420L),
    makeRow(baseQuery, "2011-04-01", "alias", "news", "rows", 2L, "idx", 221L),
    makeRow(baseQuery, "2011-04-01", "alias", "premium", "rows", 6L, "idx", 4416L),
    makeRow(baseQuery, "2011-04-01", "alias", "technology", "rows", 2L, "idx", 177L),
    makeRow(baseQuery, "2011-04-01", "alias", "travel", "rows", 2L, "idx", 243L)
);
final int idxPosition = baseQuery.getResultRowSignature().indexOf("idx");
final int rowsPosition = baseQuery.getResultRowSignature().indexOf("rows");
Comparator<ResultRow> idxComparator = Comparator.comparing(row -> ((Number) row.get(idxPosition)).floatValue());
Comparator<ResultRow> rowsComparator = Comparator.comparing(row -> ((Number) row.get(rowsPosition)).floatValue());
Comparator<ResultRow> rowsIdxComparator = Ordering.from(rowsComparator).thenComparing(idxComparator);
List<List<ResultRow>> expectedResults = Lists.newArrayList(
    Ordering.from(idxComparator).sortedCopy(allResults),
    Ordering.from(rowsIdxComparator).sortedCopy(allResults),
    Ordering.from(idxComparator).reverse().sortedCopy(allResults),
    Ordering.from(rowsIdxComparator).reverse().sortedCopy(allResults)
);
for (int i = 0; i < orderBySpecs.length; ++i) {
doTestMergeResultsWithOrderBy(baseQuery, orderBySpecs[i], expectedResults.get(i));
}
}
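The doTestMergeResultsWithOrderBy helper is not shown on this page. A rough sketch of what such a helper plausibly does, assuming the test class's factory and runner fields (the actual implementation in GroupByQueryRunnerTest may differ, e.g. by splitting the interval across simulated segments):

// Hypothetical sketch of the helper used above; not the verbatim Druid test code.
private void doTestMergeResultsWithOrderBy(
    GroupByQuery baseQuery,
    LimitSpec limitSpec,
    List<ResultRow> expectedResults
)
{
  // Attach the LimitSpec under test, merge through the toolchest so the DefaultLimitSpec ordering is applied
  // to the merged sequence, then compare against the pre-sorted expectations.
  final GroupByQuery fullQuery = baseQuery.withLimitSpec(limitSpec);
  final QueryRunner<ResultRow> mergedRunner = factory.getToolchest().mergeResults(runner);
  final Iterable<ResultRow> results =
      mergedRunner.run(QueryPlus.wrap(fullQuery), ResponseContext.createEmpty()).toList();
  TestHelper.assertExpectedObjects(expectedResults, results, "merged");
}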
use of org.apache.druid.query.groupby.orderby.DefaultLimitSpec in project druid by druid-io.
the class GroupByQueryRunnerTest method testVirtualColumnFilterOnInnerQuery.
@Test
public void testVirtualColumnFilterOnInnerQuery() {
GroupByQuery subquery = makeQueryBuilder()
    .setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
    .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
    .setDimensions(new DefaultDimensionSpec("quality", "alias"))
    .setLimitSpec(new DefaultLimitSpec(Collections.singletonList(new OrderByColumnSpec("alias", OrderByColumnSpec.Direction.DESCENDING)), 12))
    .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
    .setGranularity(QueryRunnerTestHelper.DAY_GRAN)
    .build();
GroupByQuery query = makeQueryBuilder()
    .setDataSource(subquery)
    .setQuerySegmentSpec(new MultipleIntervalSegmentSpec(ImmutableList.of(
        Intervals.of("2011-04-01T00:00:00.000Z/2011-04-01T23:58:00.000Z"),
        Intervals.of("2011-04-02T00:00:00.000Z/2011-04-03T00:00:00.000Z"))))
    .setDimensions(new DefaultDimensionSpec("alias", "alias"))
    .setVirtualColumns(new ExpressionVirtualColumn("v", "case_searched(idx > 1000, 1, 0)", ColumnType.LONG, TestExprMacroTable.INSTANCE))
    .setDimFilter(new BoundDimFilter("v", "0", null, true, false, null, null, StringComparators.NUMERIC))
    .setLimitSpec(new DefaultLimitSpec(Collections.singletonList(new OrderByColumnSpec("alias", OrderByColumnSpec.Direction.DESCENDING)), 15))
    .setAggregatorSpecs(new LongSumAggregatorFactory("rows", "rows"), new LongSumAggregatorFactory("idx", "idx"))
    .setGranularity(QueryRunnerTestHelper.DAY_GRAN)
    .build();
List<ResultRow> expectedResults = Arrays.asList(
    makeRow(query, "2011-04-01", "alias", "premium", "rows", 3L, "idx", 2900L),
    makeRow(query, "2011-04-01", "alias", "mezzanine", "rows", 3L, "idx", 2870L),
    makeRow(query, "2011-04-02", "alias", "premium", "rows", 3L, "idx", 2505L)
);
Iterable<ResultRow> results = GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
TestHelper.assertExpectedObjects(expectedResults, results, "virtual column filter on inner query");
}
use of org.apache.druid.query.groupby.orderby.DefaultLimitSpec in project druid by druid-io.
the class GroupByLimitPushDownInsufficientBufferTest method testPartialLimitPushDownMergeForceAggs.
@Test
public void testPartialLimitPushDownMergeForceAggs() {
// one segment's results use limit push down, the other doesn't because of insufficient buffer capacity
QueryToolChest<ResultRow, GroupByQuery> toolChest = groupByFactory.getToolchest();
QueryRunner<ResultRow> theRunner = new FinalizeResultsQueryRunner<>(
    toolChest.mergeResults(groupByFactory.mergeRunners(executorService, getRunner1())),
    (QueryToolChest) toolChest
);
QueryRunner<ResultRow> theRunner2 = new FinalizeResultsQueryRunner<>(
    toolChest.mergeResults(tooSmallGroupByFactory.mergeRunners(executorService, getRunner2())),
    (QueryToolChest) toolChest
);
QueryRunner<ResultRow> theRunner3 = new FinalizeResultsQueryRunner<>(
    toolChest.mergeResults(new QueryRunner<ResultRow>() {
      @Override
      public Sequence<ResultRow> run(QueryPlus<ResultRow> queryPlus, ResponseContext responseContext) {
        return Sequences
            .simple(ImmutableList.of(
                theRunner.run(queryPlus, responseContext),
                theRunner2.run(queryPlus, responseContext)))
            .flatMerge(Function.identity(), queryPlus.getQuery().getResultOrdering());
      }
    }),
    (QueryToolChest) toolChest
);
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Collections.singletonList(Intervals.utc(0, 1000000)));
GroupByQuery query = GroupByQuery.builder()
    .setDataSource("blah")
    .setQuerySegmentSpec(intervalSpec)
    .setDimensions(new DefaultDimensionSpec("dimA", null))
    .setAggregatorSpecs(new LongSumAggregatorFactory("metA", "metA"))
    .setLimitSpec(new DefaultLimitSpec(Collections.singletonList(new OrderByColumnSpec("metA", OrderByColumnSpec.Direction.DESCENDING, StringComparators.NUMERIC)), 3))
    .setGranularity(Granularities.ALL)
    .setContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_FORCE_LIMIT_PUSH_DOWN, true))
    .build();
Sequence<ResultRow> queryResult = theRunner3.run(QueryPlus.wrap(query), ResponseContext.createEmpty());
List<ResultRow> results = queryResult.toList();
ResultRow expectedRow0 = GroupByQueryRunnerTestHelper.createExpectedRow(query, "1970-01-01T00:00:00.000Z", "dimA", "zortaxx", "metA", 999L);
ResultRow expectedRow1 = GroupByQueryRunnerTestHelper.createExpectedRow(query, "1970-01-01T00:00:00.000Z", "dimA", "foo", "metA", 200L);
ResultRow expectedRow2 = GroupByQueryRunnerTestHelper.createExpectedRow(query, "1970-01-01T00:00:00.000Z", "dimA", "mango", "metA", 190L);
Assert.assertEquals(3, results.size());
Assert.assertEquals(expectedRow0, results.get(0));
Assert.assertEquals(expectedRow1, results.get(1));
Assert.assertEquals(expectedRow2, results.get(2));
}
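As a side note, the forced push-down context above is what makes the built query itself report limit push down, which in turn drives the DefaultLimitSpec forwarding in mergeResults (first snippet). A small hedged check against the query built in this test:

// Sketch: with forceLimitPushDown=true and a DefaultLimitSpec that carries a limit, the query reports push down,
// so the broker forwards the limit spec to the compute nodes instead of applying it only after merging.
Assert.assertTrue(query.isApplyLimitPushDown());
Assert.assertEquals(3, ((DefaultLimitSpec) query.getLimitSpec()).getLimit());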
use of org.apache.druid.query.groupby.orderby.DefaultLimitSpec in project druid by druid-io.
the class GroupByQueryQueryToolChestTest method testResultLevelCacheKeyWithLimitSpec.
@Test
public void testResultLevelCacheKeyWithLimitSpec() {
final GroupByQuery query1 = GroupByQuery.builder()
    .setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
    .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
    .setDimensions(new DefaultDimensionSpec("quality", "alias"))
    .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
    .setPostAggregatorSpecs(ImmutableList.of(new ExpressionPostAggregator("post", "alias + 'x'", null, TestExprMacroTable.INSTANCE)))
    .setGranularity(QueryRunnerTestHelper.DAY_GRAN)
    .setLimitSpec(new DefaultLimitSpec(ImmutableList.of(new OrderByColumnSpec("post", OrderByColumnSpec.Direction.DESCENDING)), Integer.MAX_VALUE))
    .build();
final GroupByQuery query2 = GroupByQuery.builder()
    .setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
    .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
    .setDimensions(new DefaultDimensionSpec("quality", "alias"))
    .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
    .setPostAggregatorSpecs(ImmutableList.of(new ExpressionPostAggregator("post", "alias - 'x'", null, TestExprMacroTable.INSTANCE)))
    .setGranularity(QueryRunnerTestHelper.DAY_GRAN)
    .setLimitSpec(new DefaultLimitSpec(ImmutableList.of(new OrderByColumnSpec("post", OrderByColumnSpec.Direction.DESCENDING)), Integer.MAX_VALUE))
    .build();
final CacheStrategy<ResultRow, Object, GroupByQuery> strategy1 = new GroupByQueryQueryToolChest(null).getCacheStrategy(query1);
final CacheStrategy<ResultRow, Object, GroupByQuery> strategy2 = new GroupByQueryQueryToolChest(null).getCacheStrategy(query2);
Assert.assertTrue(Arrays.equals(strategy1.computeCacheKey(query1), strategy2.computeCacheKey(query2)));
Assert.assertFalse(Arrays.equals(strategy1.computeResultLevelCacheKey(query1), strategy2.computeResultLevelCacheKey(query2)));
}
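To make the intent explicit: query1 and query2 share an identical DefaultLimitSpec and differ only in the ExpressionPostAggregator expression ("alias + 'x'" vs "alias - 'x'"). Post-aggregators are computed on the broker after merging (see the mergeResults snippet above), so they do not affect the per-segment cache key but do affect the result-level cache key, which is what the two assertions verify. A small hedged sketch of the same point:

// Sketch: only the post-aggregation differs between the two queries; the DefaultLimitSpec is identical.
Assert.assertEquals(query1.getLimitSpec(), query2.getLimitSpec());
Assert.assertNotEquals(query1.getPostAggregatorSpecs(), query2.getPostAggregatorSpecs());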