Use of org.apache.druid.query.aggregation.AggregatorFactory in project druid by druid-io.
Class ResultRow, method fromLegacyRow:
/**
* Create a row based on a legacy {@link Row} that was generated by a given {@link GroupByQuery}. This is useful
* for deserializing rows that have come off the wire in the older format. (In the past, GroupBy query results
* were sequences of {@link Row}, not ResultRow.)
*
* @param row legacy row
* @param query query corresponding to the output ResultRow
*/
public static ResultRow fromLegacyRow(Row row, final GroupByQuery query) {
  // Can't be sure if we'll get result rows with or without postaggregations, so be safe.
  final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithPostAggregators());

  int i = 0;
  if (query.getResultRowHasTimestamp()) {
    resultRow.set(i++, row.getTimestamp().getMillis());
  }

  for (DimensionSpec dimensionSpec : query.getDimensions()) {
    resultRow.set(i++, row.getRaw(dimensionSpec.getOutputName()));
  }

  for (AggregatorFactory aggregatorFactory : query.getAggregatorSpecs()) {
    resultRow.set(i++, row.getRaw(aggregatorFactory.getName()));
  }

  for (PostAggregator postAggregator : query.getPostAggregatorSpecs()) {
    resultRow.set(i++, row.getRaw(postAggregator.getName()));
  }

  return resultRow;
}
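For context, a hedged sketch of how a caller might apply fromLegacyRow to a stream of rows in the older wire format. The helper name convertLegacyResults and the Sequences.map wrapping are assumptions for illustration, not code from the Druid repository (imports assumed: Sequence, Sequences, Row, ResultRow, GroupByQuery):

// Hypothetical helper, not part of Druid: lazily convert a sequence of rows in the older
// wire format into ResultRows for the query they belong to.
static Sequence<ResultRow> convertLegacyResults(final GroupByQuery query, final Sequence<Row> legacyRows) {
  // Sequences.map applies the conversion one row at a time as the sequence is consumed.
  return Sequences.map(legacyRows, row -> ResultRow.fromLegacyRow(row, query));
}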
Use of org.apache.druid.query.aggregation.AggregatorFactory in project druid by druid-io.
Class GroupByQueryHelper, method toResultRow:
public static ResultRow toResultRow(final GroupByQuery query, final Row row) {
  final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithPostAggregators());
  int i = 0;
  if (query.getResultRowHasTimestamp()) {
    resultRow.set(i++, row.getTimestampFromEpoch());
  }
  for (DimensionSpec dimensionSpec : query.getDimensions()) {
    resultRow.set(i++, row.getRaw(dimensionSpec.getOutputName()));
  }
  for (AggregatorFactory aggregatorFactory : query.getAggregatorSpecs()) {
    resultRow.set(i++, row.getRaw(aggregatorFactory.getName()));
  }
  for (PostAggregator postAggregator : query.getPostAggregatorSpecs()) {
    resultRow.set(i++, row.getRaw(postAggregator.getName()));
  }
  return resultRow;
}
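A similarly hedged sketch of eager conversion with toResultRow; the helper name toResultRows and the surrounding collection plumbing are invented for illustration (imports assumed: java.util.ArrayList, java.util.List):

// Hypothetical helper, not part of Druid: materialize an iterable of Rows (for example rows
// read back out of an IncrementalIndex) as ResultRows for the given query.
static List<ResultRow> toResultRows(final GroupByQuery query, final Iterable<Row> rows) {
  final List<ResultRow> out = new ArrayList<>();
  for (Row row : rows) {
    out.add(GroupByQueryHelper.toResultRow(query, row));
  }
  return out;
}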
Use of org.apache.druid.query.aggregation.AggregatorFactory in project druid by druid-io.
Class RowBasedGrouperHelper, method createGrouperAccumulatorPair:
/**
* Create a {@link Grouper} that groups according to the dimensions and aggregators in "query", along with
* an {@link Accumulator} that accepts ResultRows and forwards them to the grouper.
*
* The pair will operate in one of two modes:
*
* 1) Combining mode (used if "subquery" is null). In this mode, filters from the "query" are ignored, and
* its aggregators are converted into combining form. The input ResultRows are assumed to be partially-grouped
* results originating from the provided "query".
*
* 2) Subquery mode (used if "subquery" is nonnull). In this mode, filters from the "query" (both intervals
* and dim filters) are respected, and its aggregators are used in standard (not combining) form. The input
* ResultRows are assumed to be results originating from the provided "subquery".
*
* @param query query that we are grouping for
* @param subquery optional subquery that we are receiving results from (see combining vs. subquery
* mode above)
* @param config groupBy query config
* @param bufferSupplier supplier of merge buffers
* @param combineBufferHolder holder of combine buffers. Unused if concurrencyHint = -1, and may be null in that case
* @param concurrencyHint -1 for single-threaded Grouper, >=1 for concurrent Grouper
* @param temporaryStorage temporary storage used for spilling from the Grouper
* @param spillMapper object mapper used for spilling from the Grouper
* @param grouperSorter executor service used for parallel combining. Unused if concurrencyHint = -1, and may
* be null in that case
* @param priority query priority
* @param hasQueryTimeout whether or not this query has a timeout
* @param queryTimeoutAt when this query times out, in milliseconds since the epoch
* @param mergeBufferSize size of the merge buffers from "bufferSupplier"
*/
public static Pair<Grouper<RowBasedKey>, Accumulator<AggregateResult, ResultRow>> createGrouperAccumulatorPair(
    final GroupByQuery query,
    @Nullable final GroupByQuery subquery,
    final GroupByQueryConfig config,
    final Supplier<ByteBuffer> bufferSupplier,
    @Nullable final ReferenceCountingResourceHolder<ByteBuffer> combineBufferHolder,
    final int concurrencyHint,
    final LimitedTemporaryStorage temporaryStorage,
    final ObjectMapper spillMapper,
    @Nullable final ListeningExecutorService grouperSorter,
    final int priority,
    final boolean hasQueryTimeout,
    final long queryTimeoutAt,
    final int mergeBufferSize
) {
  // concurrencyHint >= 1 for concurrent groupers, -1 for single-threaded
  Preconditions.checkArgument(concurrencyHint >= 1 || concurrencyHint == -1, "invalid concurrencyHint");
  if (concurrencyHint >= 1) {
    Preconditions.checkNotNull(grouperSorter, "grouperSorter executor must be provided");
  }

  // See method-level javadoc; we go into combining mode if there is no subquery.
  final boolean combining = subquery == null;
  final List<ColumnType> valueTypes = DimensionHandlerUtils.getValueTypesFromDimensionSpecs(query.getDimensions());
  final GroupByQueryConfig querySpecificConfig = config.withOverrides(query);
  final boolean includeTimestamp = query.getResultRowHasTimestamp();
  final ThreadLocal<ResultRow> columnSelectorRow = new ThreadLocal<>();

  ColumnSelectorFactory columnSelectorFactory = createResultRowBasedColumnSelectorFactory(
      combining ? query : subquery,
      columnSelectorRow::get,
      RowSignature.Finalization.UNKNOWN
  );

  // Apply virtual columns if we are in subquery (non-combining) mode.
  if (!combining) {
    columnSelectorFactory = query.getVirtualColumns().wrap(columnSelectorFactory);
  }

  final boolean willApplyLimitPushDown = query.isApplyLimitPushDown();
  final DefaultLimitSpec limitSpec = willApplyLimitPushDown ? (DefaultLimitSpec) query.getLimitSpec() : null;
  boolean sortHasNonGroupingFields = false;
  if (willApplyLimitPushDown) {
    sortHasNonGroupingFields = DefaultLimitSpec.sortingOrderHasNonGroupingFields(limitSpec, query.getDimensions());
  }

  final AggregatorFactory[] aggregatorFactories;
  if (combining) {
    aggregatorFactories = query.getAggregatorSpecs()
                               .stream()
                               .map(AggregatorFactory::getCombiningFactory)
                               .toArray(AggregatorFactory[]::new);
  } else {
    aggregatorFactories = query.getAggregatorSpecs().toArray(new AggregatorFactory[0]);
  }

  final Grouper.KeySerdeFactory<RowBasedKey> keySerdeFactory = new RowBasedKeySerdeFactory(
      includeTimestamp,
      query.getContextSortByDimsFirst(),
      query.getDimensions(),
      querySpecificConfig.getMaxMergingDictionarySize() / (concurrencyHint == -1 ? 1 : concurrencyHint),
      valueTypes,
      aggregatorFactories,
      limitSpec
  );

  final Grouper<RowBasedKey> grouper;
  if (concurrencyHint == -1) {
    grouper = new SpillingGrouper<>(
        bufferSupplier, keySerdeFactory, columnSelectorFactory, aggregatorFactories,
        querySpecificConfig.getBufferGrouperMaxSize(),
        querySpecificConfig.getBufferGrouperMaxLoadFactor(),
        querySpecificConfig.getBufferGrouperInitialBuckets(),
        temporaryStorage, spillMapper, true, limitSpec, sortHasNonGroupingFields, mergeBufferSize
    );
  } else {
    final Grouper.KeySerdeFactory<RowBasedKey> combineKeySerdeFactory = new RowBasedKeySerdeFactory(
        includeTimestamp,
        query.getContextSortByDimsFirst(),
        query.getDimensions(),
        // use entire dictionary space for combining key serde
        querySpecificConfig.getMaxMergingDictionarySize(),
        valueTypes,
        aggregatorFactories,
        limitSpec
    );

    grouper = new ConcurrentGrouper<>(
        querySpecificConfig, bufferSupplier, combineBufferHolder, keySerdeFactory, combineKeySerdeFactory,
        columnSelectorFactory, aggregatorFactories, temporaryStorage, spillMapper, concurrencyHint,
        limitSpec, sortHasNonGroupingFields, grouperSorter, priority, hasQueryTimeout, queryTimeoutAt
    );
  }

  final int keySize = includeTimestamp ? query.getDimensions().size() + 1 : query.getDimensions().size();
  final ValueExtractFunction valueExtractFn =
      makeValueExtractFunction(query, combining, includeTimestamp, columnSelectorFactory, valueTypes);

  final Predicate<ResultRow> rowPredicate;
  if (combining) {
    // Filters are not applied in combining mode.
    rowPredicate = row -> true;
  } else {
    rowPredicate = getResultRowPredicate(query, subquery);
  }

  final Accumulator<AggregateResult, ResultRow> accumulator = (priorResult, row) -> {
    BaseQuery.checkInterrupted();

    if (priorResult != null && !priorResult.isOk()) {
      // Pass-through error returns without doing more work.
      return priorResult;
    }

    if (!grouper.isInitialized()) {
      grouper.init();
    }

    if (!rowPredicate.test(row)) {
      return AggregateResult.ok();
    }

    columnSelectorRow.set(row);

    final Comparable[] key = new Comparable[keySize];
    valueExtractFn.apply(row, key);

    final AggregateResult aggregateResult = grouper.aggregate(new RowBasedKey(key));
    columnSelectorRow.set(null);
    return aggregateResult;
  };

  return new Pair<>(grouper, accumulator);
}
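A hedged sketch of how the returned pair might be driven: accumulate a Sequence of ResultRows through the accumulator, check the AggregateResult, then read grouped entries back out of the grouper. The helper name accumulateAll and the error handling are assumptions for illustration; only Pair, Grouper, Accumulator, AggregateResult, Sequence, and ISE are Druid types:

// Hypothetical driver, not part of Druid: push all input rows through the accumulator
// returned by createGrouperAccumulatorPair.
static Grouper<RowBasedKey> accumulateAll(
    final Pair<Grouper<RowBasedKey>, Accumulator<AggregateResult, ResultRow>> pair,
    final Sequence<ResultRow> inputRows
) {
  // Feed every input row through the accumulator. A non-ok AggregateResult means the grouper
  // ran out of merge-buffer or spill space, so aggregation could not continue.
  final AggregateResult result = inputRows.accumulate(AggregateResult.ok(), pair.rhs);
  if (!result.isOk()) {
    throw new ISE("Grouping failed: %s", result.getReason());
  }
  // The caller reads grouped entries via grouper.iterator(true) and closes the grouper when done.
  return pair.lhs;
}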
Use of org.apache.druid.query.aggregation.AggregatorFactory in project druid by druid-io.
Class GroupByStrategyV1, method processSubqueryResult:
@Override
public Sequence<ResultRow> processSubqueryResult(
    GroupByQuery subquery,
    GroupByQuery query,
    GroupByQueryResource resource,
    Sequence<ResultRow> subqueryResult,
    boolean wasQueryPushedDown
) {
  final Set<AggregatorFactory> aggs = new HashSet<>();
  // Nested group-bys work by first running the inner query and then materializing the results in an incremental
  // index which the outer query is then run against. To build the incremental index, we use the fieldNames from
  // the aggregators for the outer query to define the column names so that the index will match the query. If
  // there are multiple types of aggregators in the outer query referencing the same fieldName, we will try to build
  // multiple columns of the same name using different aggregator types and will fail. Here, we permit multiple
  // aggregators of the same type referencing the same fieldName (and skip creating identical columns for the
  // subsequent ones) and return an error if the aggregator types are different.
  final Set<String> dimensionNames = new HashSet<>();
  for (DimensionSpec dimension : subquery.getDimensions()) {
    dimensionNames.add(dimension.getOutputName());
  }

  for (AggregatorFactory aggregatorFactory : query.getAggregatorSpecs()) {
    for (final AggregatorFactory transferAgg : aggregatorFactory.getRequiredColumns()) {
      if (dimensionNames.contains(transferAgg.getName())) {
        // The subquery already provides this column as a dimension; skip it rather than
        // creating a duplicate column of the same name.
        continue;
      }

      if (Iterables.any(aggs, new Predicate<AggregatorFactory>() {
        @Override
        public boolean apply(AggregatorFactory agg) {
          return agg.getName().equals(transferAgg.getName()) && !agg.equals(transferAgg);
        }
      })) {
        throw new IAE(
            "Inner aggregator can currently only be referenced by a single type of outer aggregator for '%s'",
            transferAgg.getName()
        );
      }

      aggs.add(transferAgg);
    }
  }

  // We need the inner incremental index to have all the columns required by the outer query
  final GroupByQuery innerQuery = new GroupByQuery.Builder(subquery)
      .setAggregatorSpecs(ImmutableList.copyOf(aggs))
      .setInterval(subquery.getIntervals())
      .setPostAggregatorSpecs(new ArrayList<>())
      .build();

  final GroupByQuery outerQuery = new GroupByQuery.Builder(query)
      .setLimitSpec(query.getLimitSpec().merge(subquery.getLimitSpec()))
      .build();

  final IncrementalIndex innerQueryResultIndex = GroupByQueryHelper.makeIncrementalIndex(
      innerQuery.withOverriddenContext(ImmutableMap.of(GroupByQueryHelper.CTX_KEY_SORT_RESULTS, true)),
      subquery,
      configSupplier.get(),
      subqueryResult
  );

  // Outer query might have multiple intervals, but they are expected to be non-overlapping and sorted which
  // is ensured by QuerySegmentSpec.
  // GroupByQueryEngine can only process one interval at a time, so we need to call it once per interval
  // and concatenate the results.
  final IncrementalIndex outerQueryResultIndex = GroupByQueryHelper.makeIncrementalIndex(
      outerQuery,
      null,
      configSupplier.get(),
      Sequences.concat(
          Sequences.map(
              Sequences.simple(outerQuery.getIntervals()),
              new Function<Interval, Sequence<ResultRow>>() {
                @Override
                public Sequence<ResultRow> apply(Interval interval) {
                  return process(
                      outerQuery.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(ImmutableList.of(interval))),
                      new IncrementalIndexStorageAdapter(innerQueryResultIndex)
                  );
                }
              }
          )
      )
  );

  innerQueryResultIndex.close();

  return Sequences.withBaggage(
      outerQuery.postProcess(GroupByQueryHelper.postAggregate(query, outerQueryResultIndex)),
      outerQueryResultIndex
  );
}
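As a concrete (and hypothetical) illustration of the restriction described in the comment above: two outer aggregators that require the same inner column with different aggregator types would make processSubqueryResult throw the IAE, because their getRequiredColumns() outputs share a name but are not equal. The aggregator names and the column "x" are invented:

// Hypothetical example, not from the Druid codebase.
// LongSumAggregatorFactory("sumAsLong", "x").getRequiredColumns() yields a long-sum named "x",
// while DoubleSumAggregatorFactory("sumAsDouble", "x").getRequiredColumns() yields a double-sum
// named "x". Same name, different type, so the check above throws:
// "Inner aggregator can currently only be referenced by a single type of outer aggregator for 'x'"
final List<AggregatorFactory> conflictingOuterAggs = ImmutableList.of(
    new LongSumAggregatorFactory("sumAsLong", "x"),
    new DoubleSumAggregatorFactory("sumAsDouble", "x")
);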
Use of org.apache.druid.query.aggregation.AggregatorFactory in project druid by druid-io.
Class DefaultLimitSpec, method build:
@Override
public Function<Sequence<ResultRow>, Sequence<ResultRow>> build(final GroupByQuery query) {
  final List<DimensionSpec> dimensions = query.getDimensions();

  // Can avoid re-sorting if the natural ordering is good enough.
  boolean sortingNeeded = dimensions.size() < columns.size();

  final Set<String> aggAndPostAggNames = new HashSet<>();
  for (AggregatorFactory agg : query.getAggregatorSpecs()) {
    aggAndPostAggNames.add(agg.getName());
  }
  for (PostAggregator postAgg : query.getPostAggregatorSpecs()) {
    aggAndPostAggNames.add(postAgg.getName());
  }

  if (!sortingNeeded) {
    for (int i = 0; i < columns.size(); i++) {
      final OrderByColumnSpec columnSpec = columns.get(i);

      if (aggAndPostAggNames.contains(columnSpec.getDimension())) {
        sortingNeeded = true;
        break;
      }

      final ColumnType columnType = getOrderByType(columnSpec, dimensions);
      final StringComparator naturalComparator;
      if (columnType.is(ValueType.STRING)) {
        naturalComparator = StringComparators.LEXICOGRAPHIC;
      } else if (columnType.isNumeric()) {
        naturalComparator = StringComparators.NUMERIC;
      } else if (columnType.isArray()) {
        if (columnType.getElementType().isNumeric()) {
          naturalComparator = StringComparators.NUMERIC;
        } else {
          naturalComparator = StringComparators.LEXICOGRAPHIC;
        }
      } else {
        sortingNeeded = true;
        break;
      }

      if (columnSpec.getDirection() != OrderByColumnSpec.Direction.ASCENDING
          || !columnSpec.getDimensionComparator().equals(naturalComparator)
          || !columnSpec.getDimension().equals(dimensions.get(i).getOutputName())) {
        sortingNeeded = true;
        break;
      }
    }
  }

  if (!sortingNeeded) {
    // If granularity is ALL, sortByDimsFirst doesn't change the sorting order.
    sortingNeeded = !query.getGranularity().equals(Granularities.ALL) && query.getContextSortByDimsFirst();
  }

  if (!sortingNeeded) {
    String timestampField = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
    if (timestampField != null && !timestampField.isEmpty()) {
      int timestampResultFieldIndex = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX);
      sortingNeeded = query.getContextSortByDimsFirst()
                      ? timestampResultFieldIndex != query.getDimensions().size() - 1
                      : timestampResultFieldIndex != 0;
    }
  }

  final Function<Sequence<ResultRow>, Sequence<ResultRow>> sortAndLimitFn;

  if (sortingNeeded) {
    // Materialize the Comparator first for fast-fail error checking.
    final Ordering<ResultRow> ordering = makeComparator(
        query.getResultRowSignature(),
        query.getResultRowHasTimestamp(),
        query.getDimensions(),
        query.getAggregatorSpecs(),
        query.getPostAggregatorSpecs(),
        query.getContextSortByDimsFirst()
    );

    // Both branches use a stable sort; important so consistent results are returned from query to query if the
    // underlying data isn't changing. (Useful for query reproducibility and offset-based pagination.)
    if (isLimited()) {
      sortAndLimitFn = results -> new TopNSequence<>(results, ordering, limit + offset);
    } else {
      sortAndLimitFn = results -> Sequences.sort(results, ordering).limit(limit + offset);
    }
  } else {
    if (isLimited()) {
      sortAndLimitFn = results -> results.limit(limit + offset);
    } else {
      sortAndLimitFn = Functions.identity();
    }
  }

  // Finally, apply offset after sorting and limiting.
  if (isOffset()) {
    return results -> sortAndLimitFn.apply(results).skip(offset);
  } else {
    return sortAndLimitFn;
  }
}
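For intuition, a hedged example of a limit spec that build() would treat as not needing a re-sort: the ordering matches the query's own (single, STRING) grouping dimension, ascending, with the lexicographic comparator, so only the limit is applied. The dimension name "country", the limit value, and the (columns, offset, limit) constructor shape are assumptions for illustration:

// Hypothetical example, not from the Druid codebase: ordering by the first grouping dimension,
// ascending, with the natural comparator for a STRING column, so no re-sort is required.
final DefaultLimitSpec noResortNeeded = new DefaultLimitSpec(
    ImmutableList.of(
        new OrderByColumnSpec("country", OrderByColumnSpec.Direction.ASCENDING, StringComparators.LEXICOGRAPHIC)
    ),
    0,   // offset (assumed constructor shape: columns, offset, limit)
    100  // limit
);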