Use of org.apache.druid.query.dimension.DimensionSpec in project druid by druid-io.
The class MovingAverageIterableTest, method testWithFilteredAggregation:
@Test
public void testWithFilteredAggregation() {
  Map<String, Object> event1 = new HashMap<>();
  Map<String, Object> event2 = new HashMap<>();
  List<DimensionSpec> ds = new ArrayList<>();
  ds.add(new DefaultDimensionSpec("gender", "gender"));
  event1.put("gender", "m");
  event1.put("pageViews", 10L);
  Row row1 = new MapBasedRow(JAN_1, event1);
  event2.put("gender", "m");
  event2.put("pageViews", 20L);
  Row row2 = new MapBasedRow(JAN_4, event2);
  Sequence<RowBucket> seq = Sequences.simple(Arrays.asList(
      new RowBucket(JAN_1, Collections.singletonList(row1)),
      new RowBucket(JAN_2, Collections.emptyList()),
      new RowBucket(JAN_3, Collections.emptyList()),
      new RowBucket(JAN_4, Collections.singletonList(row2))
  ));
  AveragerFactory averagerfactory = new LongMeanAveragerFactory("movingAvgPageViews", 4, 1, "pageViews");
  AggregatorFactory aggregatorFactory = new LongSumAggregatorFactory("pageViews", "pageViews");
  DimFilter filter = new SelectorDimFilter("gender", "m", null);
  FilteredAggregatorFactory filteredAggregatorFactory = new FilteredAggregatorFactory(aggregatorFactory, filter);
  Iterator<Row> iter = new MovingAverageIterable(
      seq,
      ds,
      Collections.singletonList(averagerfactory),
      Collections.emptyList(),
      Collections.singletonList(filteredAggregatorFactory)
  ).iterator();
  Assert.assertTrue(iter.hasNext());
  Row result = iter.next();
  Assert.assertEquals("m", (result.getDimension("gender")).get(0));
  Assert.assertEquals(2.5f, result.getMetric("movingAvgPageViews").floatValue(), 0.0f);
  Assert.assertTrue(iter.hasNext());
  result = iter.next();
  Assert.assertEquals("m", (result.getDimension("gender")).get(0));
  Assert.assertEquals(2.5f, result.getMetric("movingAvgPageViews").floatValue(), 0.0f);
  Assert.assertTrue(iter.hasNext());
  result = iter.next();
  Assert.assertEquals("m", (result.getDimension("gender")).get(0));
  Assert.assertEquals(2.5f, result.getMetric("movingAvgPageViews").floatValue(), 0.0f);
  Assert.assertTrue(iter.hasNext());
  result = iter.next();
  Assert.assertEquals("m", (result.getDimension("gender")).get(0));
  Assert.assertEquals(7.5f, result.getMetric("movingAvgPageViews").floatValue(), 0.0f);
  Assert.assertFalse(iter.hasNext());
}
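For reference, the asserted values follow from the averager's 4-bucket window (the second constructor argument of LongMeanAveragerFactory), with buckets that contain no row contributing zero: the first three emitted rows only have the JAN_1 value in their window, and the JAN_4 row has both values. A minimal arithmetic sketch, in plain Java and independent of the Druid classes above:

// Window size is 4; empty buckets count as 0 toward the mean.
double meanAtJan1 = (10L + 0 + 0 + 0) / 4.0;   // 2.5 -- only the JAN_1 row is in the window
double meanAtJan4 = (10L + 0 + 0 + 20L) / 4.0; // 7.5 -- the JAN_1 and JAN_4 rows are both in the window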
Use of org.apache.druid.query.dimension.DimensionSpec in project druid by druid-io.
The class HllSketchBaseSqlAggregator, method toDruidAggregation:
@Nullable
@Override
public Aggregation toDruidAggregation(
    PlannerContext plannerContext,
    RowSignature rowSignature,
    VirtualColumnRegistry virtualColumnRegistry,
    RexBuilder rexBuilder,
    String name,
    AggregateCall aggregateCall,
    Project project,
    List<Aggregation> existingAggregations,
    boolean finalizeAggregations
)
{
  // Don't use Aggregations.getArgumentsForSimpleAggregator, since it won't let us use direct column access
  // for string columns.
  final RexNode columnRexNode = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(0));
  final DruidExpression columnArg = Expressions.toDruidExpression(plannerContext, rowSignature, columnRexNode);
  if (columnArg == null) {
    return null;
  }
  final int logK;
  if (aggregateCall.getArgList().size() >= 2) {
    final RexNode logKarg = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(1));
    if (!logKarg.isA(SqlKind.LITERAL)) {
      // logK must be a literal in order to plan.
      return null;
    }
    logK = ((Number) RexLiteral.value(logKarg)).intValue();
  } else {
    logK = HllSketchAggregatorFactory.DEFAULT_LG_K;
  }
  final String tgtHllType;
  if (aggregateCall.getArgList().size() >= 3) {
    final RexNode tgtHllTypeArg = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(2));
    if (!tgtHllTypeArg.isA(SqlKind.LITERAL)) {
      // tgtHllType must be a literal in order to plan.
      return null;
    }
    tgtHllType = RexLiteral.stringValue(tgtHllTypeArg);
  } else {
    tgtHllType = HllSketchAggregatorFactory.DEFAULT_TGT_HLL_TYPE.name();
  }
  final AggregatorFactory aggregatorFactory;
  final String aggregatorName = finalizeAggregations ? Calcites.makePrefixedName(name, "a") : name;
  if (columnArg.isDirectColumnAccess()
      && rowSignature.getColumnType(columnArg.getDirectColumn()).map(type -> type.is(ValueType.COMPLEX)).orElse(false)) {
    aggregatorFactory = new HllSketchMergeAggregatorFactory(aggregatorName, columnArg.getDirectColumn(), logK, tgtHllType, ROUND);
  } else {
    final RelDataType dataType = columnRexNode.getType();
    final ColumnType inputType = Calcites.getColumnTypeForRelDataType(dataType);
    if (inputType == null) {
      throw new ISE("Cannot translate sqlTypeName[%s] to Druid type for field[%s]", dataType.getSqlTypeName(), aggregatorName);
    }
    final DimensionSpec dimensionSpec;
    if (columnArg.isDirectColumnAccess()) {
      dimensionSpec = columnArg.getSimpleExtraction().toDimensionSpec(null, inputType);
    } else {
      String virtualColumnName = virtualColumnRegistry.getOrCreateVirtualColumnForExpression(columnArg, dataType);
      dimensionSpec = new DefaultDimensionSpec(virtualColumnName, null, inputType);
    }
    aggregatorFactory = new HllSketchBuildAggregatorFactory(aggregatorName, dimensionSpec.getDimension(), logK, tgtHllType, ROUND);
  }
  return toAggregation(name, finalizeAggregations, aggregatorFactory);
}
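For orientation, the literal-only argument checks above correspond to the optional arguments of the SQL functions built on this base class (for example APPROX_COUNT_DISTINCT_DS_HLL from the druid-datasketches extension). A rough mapping, written here as comments since the exact function name depends on the registered subclass:

// APPROX_COUNT_DISTINCT_DS_HLL(col)               -> logK = HllSketchAggregatorFactory.DEFAULT_LG_K, tgtHllType = DEFAULT_TGT_HLL_TYPE
// APPROX_COUNT_DISTINCT_DS_HLL(col, 12)           -> logK = 12
// APPROX_COUNT_DISTINCT_DS_HLL(col, 12, 'HLL_8')  -> logK = 12, tgtHllType = 'HLL_8'
// APPROX_COUNT_DISTINCT_DS_HLL(col, someOtherCol) -> returns null; the second argument is not a literal, so the aggregation cannot be planned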
Use of org.apache.druid.query.dimension.DimensionSpec in project druid by druid-io.
The class ThetaSketchBaseSqlAggregator, method toDruidAggregation:
@Nullable
@Override
public Aggregation toDruidAggregation(
    PlannerContext plannerContext,
    RowSignature rowSignature,
    VirtualColumnRegistry virtualColumnRegistry,
    RexBuilder rexBuilder,
    String name,
    AggregateCall aggregateCall,
    Project project,
    List<Aggregation> existingAggregations,
    boolean finalizeAggregations
)
{
  // Don't use Aggregations.getArgumentsForSimpleAggregator, since it won't let us use direct column access
  // for string columns.
  final RexNode columnRexNode = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(0));
  final DruidExpression columnArg = Expressions.toDruidExpression(plannerContext, rowSignature, columnRexNode);
  if (columnArg == null) {
    return null;
  }
  final int sketchSize;
  if (aggregateCall.getArgList().size() >= 2) {
    final RexNode sketchSizeArg = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(1));
    if (!sketchSizeArg.isA(SqlKind.LITERAL)) {
      // sketchSize must be a literal in order to plan.
      return null;
    }
    sketchSize = ((Number) RexLiteral.value(sketchSizeArg)).intValue();
  } else {
    sketchSize = SketchAggregatorFactory.DEFAULT_MAX_SKETCH_SIZE;
  }
  final AggregatorFactory aggregatorFactory;
  final String aggregatorName = finalizeAggregations ? Calcites.makePrefixedName(name, "a") : name;
  if (columnArg.isDirectColumnAccess()
      && rowSignature.getColumnType(columnArg.getDirectColumn()).map(type -> type.is(ValueType.COMPLEX)).orElse(false)) {
    aggregatorFactory = new SketchMergeAggregatorFactory(aggregatorName, columnArg.getDirectColumn(), sketchSize, null, null, null);
  } else {
    final RelDataType dataType = columnRexNode.getType();
    final ColumnType inputType = Calcites.getColumnTypeForRelDataType(dataType);
    if (inputType == null) {
      throw new ISE("Cannot translate sqlTypeName[%s] to Druid type for field[%s]", dataType.getSqlTypeName(), aggregatorName);
    }
    final DimensionSpec dimensionSpec;
    if (columnArg.isDirectColumnAccess()) {
      dimensionSpec = columnArg.getSimpleExtraction().toDimensionSpec(null, inputType);
    } else {
      String virtualColumnName = virtualColumnRegistry.getOrCreateVirtualColumnForExpression(columnArg, dataType);
      dimensionSpec = new DefaultDimensionSpec(virtualColumnName, null, inputType);
    }
    aggregatorFactory = new SketchMergeAggregatorFactory(aggregatorName, dimensionSpec.getDimension(), sketchSize, null, null, null);
  }
  return toAggregation(name, finalizeAggregations, aggregatorFactory);
}
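The theta sketch variant applies the same literal-only rule to its single optional argument, the sketch size. Again as a rough, comment-style mapping, assuming the APPROX_COUNT_DISTINCT_DS_THETA function built on this base class:

// APPROX_COUNT_DISTINCT_DS_THETA(col)           -> sketchSize = SketchAggregatorFactory.DEFAULT_MAX_SKETCH_SIZE
// APPROX_COUNT_DISTINCT_DS_THETA(col, 32768)    -> sketchSize = 32768
// APPROX_COUNT_DISTINCT_DS_THETA(col, someCol)  -> returns null; a non-literal size cannot be planned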
Use of org.apache.druid.query.dimension.DimensionSpec in project druid by druid-io.
The class BloomFilterSqlAggregator, method toDruidAggregation:
@Nullable
@Override
public Aggregation toDruidAggregation(
    PlannerContext plannerContext,
    RowSignature rowSignature,
    VirtualColumnRegistry virtualColumnRegistry,
    RexBuilder rexBuilder,
    String name,
    AggregateCall aggregateCall,
    Project project,
    List<Aggregation> existingAggregations,
    boolean finalizeAggregations
)
{
  final RexNode inputOperand = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(0));
  final DruidExpression input = Expressions.toDruidExpression(plannerContext, rowSignature, inputOperand);
  if (input == null) {
    return null;
  }
  final AggregatorFactory aggregatorFactory;
  final String aggName = StringUtils.format("%s:agg", name);
  final RexNode maxNumEntriesOperand = Expressions.fromFieldAccess(rowSignature, project, aggregateCall.getArgList().get(1));
  if (!maxNumEntriesOperand.isA(SqlKind.LITERAL)) {
    // maxNumEntriesOperand must be a literal in order to plan.
    return null;
  }
  final int maxNumEntries = ((Number) RexLiteral.value(maxNumEntriesOperand)).intValue();
  // Look for existing matching aggregatorFactory.
  for (final Aggregation existing : existingAggregations) {
    for (AggregatorFactory factory : existing.getAggregatorFactories()) {
      if (factory instanceof BloomFilterAggregatorFactory) {
        final BloomFilterAggregatorFactory theFactory = (BloomFilterAggregatorFactory) factory;
        // Check input for equivalence.
        final boolean inputMatches;
        final DruidExpression virtualInput =
            virtualColumnRegistry.findVirtualColumnExpressions(theFactory.requiredFields())
                                 .stream()
                                 .findFirst()
                                 .orElse(null);
        if (virtualInput == null) {
          if (input.isDirectColumnAccess()) {
            inputMatches = input.getDirectColumn().equals(theFactory.getField().getDimension());
          } else {
            inputMatches = input.getSimpleExtraction().getColumn().equals(theFactory.getField().getDimension())
                           && input.getSimpleExtraction().getExtractionFn().equals(theFactory.getField().getExtractionFn());
          }
        } else {
          inputMatches = virtualInput.equals(input);
        }
        final boolean matches = inputMatches && theFactory.getMaxNumEntries() == maxNumEntries;
        if (matches) {
          // Found existing one. Use this.
          return Aggregation.create(theFactory);
        }
      }
    }
  }
  // No existing match found. Create a new one.
  ColumnType valueType = Calcites.getColumnTypeForRelDataType(inputOperand.getType());
  final DimensionSpec spec;
  if (input.isDirectColumnAccess()) {
    spec = new DefaultDimensionSpec(
        input.getSimpleExtraction().getColumn(),
        StringUtils.format("%s:%s", name, input.getSimpleExtraction().getColumn()),
        valueType
    );
  } else if (input.isSimpleExtraction()) {
    spec = new ExtractionDimensionSpec(
        input.getSimpleExtraction().getColumn(),
        StringUtils.format("%s:%s", name, input.getSimpleExtraction().getColumn()),
        valueType,
        input.getSimpleExtraction().getExtractionFn()
    );
  } else {
    String virtualColumnName = virtualColumnRegistry.getOrCreateVirtualColumnForExpression(input, inputOperand.getType());
    spec = new DefaultDimensionSpec(virtualColumnName, StringUtils.format("%s:%s", name, virtualColumnName));
  }
  aggregatorFactory = new BloomFilterAggregatorFactory(aggName, spec, maxNumEntries);
  return Aggregation.create(aggregatorFactory);
}
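The scan over existingAggregations above is what lets a single SQL query reuse one bloom filter aggregator when the same input and maxNumEntries appear more than once. Loosely, and assuming the BLOOM_FILTER SQL function from the druid-bloom-filter extension (the query text below is purely illustrative):

// SELECT BLOOM_FILTER(dim1, 1000), BLOOM_FILTER(dim1, 1000) FROM ds
//   -> the second call finds the first BloomFilterAggregatorFactory (same input, same maxNumEntries) and reuses it
// SELECT BLOOM_FILTER(dim1, 1000), BLOOM_FILTER(dim1, 2000) FROM ds
//   -> maxNumEntries differs, so a second BloomFilterAggregatorFactory is created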
Use of org.apache.druid.query.dimension.DimensionSpec in project druid by druid-io.
The class GroupByStrategyV2, method mergeResults:
@Override
public Sequence<ResultRow> mergeResults(
    final QueryRunner<ResultRow> baseRunner,
    final GroupByQuery query,
    final ResponseContext responseContext
)
{
  // Merge streams using ResultMergeQueryRunner, then apply postaggregators, then apply limit (which may
  // involve materialization)
  final ResultMergeQueryRunner<ResultRow> mergingQueryRunner =
      new ResultMergeQueryRunner<>(baseRunner, this::createResultComparator, this::createMergeFn);
  // Set up downstream context.
  final ImmutableMap.Builder<String, Object> context = ImmutableMap.builder();
  context.put("finalize", false);
  context.put(GroupByQueryConfig.CTX_KEY_STRATEGY, GroupByStrategySelector.STRATEGY_V2);
  context.put(CTX_KEY_OUTERMOST, false);
  Granularity granularity = query.getGranularity();
  List<DimensionSpec> dimensionSpecs = query.getDimensions();
  // the CTX_TIMESTAMP_RESULT_FIELD is set in DruidQuery.java
  final String timestampResultField = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD);
  final boolean hasTimestampResultField = (timestampResultField != null && !timestampResultField.isEmpty())
                                          && query.getContextBoolean(CTX_KEY_OUTERMOST, true)
                                          && !query.isApplyLimitPushDown();
  int timestampResultFieldIndex = 0;
  if (hasTimestampResultField) {
    // For SQL like "GROUP BY city_id, TIME_FLOOR(__time TO DAY)",
    // the originally translated query has granularity=all and dimensions [d0, d1].
    // The better plan is granularity=day and dimensions [d0],
    // but then the ResultRow structure changes from [d0, d1] to [__time, d0];
    // this structure should be fixed back to [d0, d1] (actually [d0, __time]) before postAggs are applied.
    //
    // The above is the general idea of this optimization.
    // From a coding perspective, however, granularity=all and the "d0" dimension are referenced in many places
    // (e.g. subtotals, having, grouping sets, post-aggregations), so a lot of code would need to change if the
    // "d0" dimension were removed from query.dimensions, and the same holds for the granularity change.
    // To keep the change contained, this optimization is implemented as an engine-level inner process change of
    // the groupBy engine; most of the code lives in GroupByStrategyV2 and concerns the processing between the
    // broker and the compute nodes. Basic logic such as nested queries and subtotals is kept unchanged and still
    // sees granularity=all and the "d0" dimension.
    //
    // The trade-off is that GroupByStrategyV2 behaves differently depending on the query context set in DruidQuery.
    // In other words, the query generated by "EXPLAIN PLAN FOR SELECT ..." doesn't match the native query that is
    // ACTUALLY executed: the granularity and dimensions are slightly different.
    // Part of the query planning logic is now handled in GroupByStrategyV2, not only in DruidQuery.toGroupByQuery().
    final Granularity timestampResultFieldGranularity =
        query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_GRANULARITY);
    dimensionSpecs = query.getDimensions()
                          .stream()
                          .filter(dimensionSpec -> !dimensionSpec.getOutputName().equals(timestampResultField))
                          .collect(Collectors.toList());
    granularity = timestampResultFieldGranularity;
    // When timestampResultField is the last dimension, sortByDimsFirst should be set to true;
    // otherwise the downstream results are sorted by the row's timestamp first, which makes the final ordering
    // not as expected.
    timestampResultFieldIndex = query.getContextValue(GroupByQuery.CTX_TIMESTAMP_RESULT_FIELD_INDEX);
    if (!query.getContextSortByDimsFirst() && timestampResultFieldIndex == query.getDimensions().size() - 1) {
      context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, true);
    }
    // When timestampResultField is the first dimension, sortByDimsFirst=true is effectively equal to
    // sortByDimsFirst=false.
    if (query.getContextSortByDimsFirst() && timestampResultFieldIndex == 0) {
      context.put(GroupByQuery.CTX_KEY_SORT_BY_DIMS_FIRST, false);
    }
    // When hasTimestampResultField=true and timestampResultField is neither the first nor the last dimension,
    // the DefaultLimitSpec will always do the reordering.
  }
  final int timestampResultFieldIndexInOriginalDimensions = timestampResultFieldIndex;
  if (query.getUniversalTimestamp() != null && !hasTimestampResultField) {
    // universalTimestamp works only when granularity is all;
    // hasTimestampResultField works only when granularity is all;
    // fudgeTimestamp should not be used when hasTimestampResultField=true because the row's actual timestamp is used.
    context.put(CTX_KEY_FUDGE_TIMESTAMP, String.valueOf(query.getUniversalTimestamp().getMillis()));
  }
  // The having spec shouldn't be passed down, so we need to convey the existing limit push down status
  context.put(GroupByQueryConfig.CTX_KEY_APPLY_LIMIT_PUSH_DOWN, query.isApplyLimitPushDown());
  // Always request array result rows when passing the query downstream.
  context.put(GroupByQueryConfig.CTX_KEY_ARRAY_RESULT_ROWS, true);
  final GroupByQuery newQuery = new GroupByQuery(
      query.getDataSource(),
      query.getQuerySegmentSpec(),
      query.getVirtualColumns(),
      query.getDimFilter(),
      granularity,
      dimensionSpecs,
      query.getAggregatorSpecs(),
      // Don't apply postaggregators on compute nodes
      ImmutableList.of(),
      // Don't do "having" clause until the end of this method.
      null,
      // Pass the limit spec down only when limit push-down is enabled; otherwise the limit is applied later
      // (higher-up).
      query.isApplyLimitPushDown() ? ((DefaultLimitSpec) query.getLimitSpec()).withOffsetToLimit() : null,
      query.getSubtotalsSpec(),
      query.getContext()
  ).withOverriddenContext(context.build());
  final Sequence<ResultRow> mergedResults = mergingQueryRunner.run(QueryPlus.wrap(newQuery), responseContext);
  if (!query.getContextBoolean(CTX_KEY_OUTERMOST, true)
      || query.getContextBoolean(GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY, false)) {
    return mergedResults;
  } else if (query.getPostAggregatorSpecs().isEmpty()) {
    if (!hasTimestampResultField) {
      return mergedResults;
    }
    return Sequences.map(mergedResults, row -> {
      final ResultRow resultRow = ResultRow.create(query.getResultRowSizeWithoutPostAggregators());
      moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, resultRow);
      return resultRow;
    });
  } else {
    return Sequences.map(mergedResults, row -> {
      // This function's purpose is to apply PostAggregators.
      final ResultRow rowWithPostAggregations = ResultRow.create(query.getResultRowSizeWithPostAggregators());
      // Copy everything that comes before the postaggregations.
      if (hasTimestampResultField) {
        moveOrReplicateTimestampInRow(query, timestampResultFieldIndexInOriginalDimensions, row, rowWithPostAggregations);
      } else {
        for (int i = 0; i < query.getResultRowPostAggregatorStart(); i++) {
          rowWithPostAggregations.set(i, row.get(i));
        }
      }
      // Compute postaggregations. We need to do this with a result-row map because PostAggregator.compute
      // expects a map. Some further design adjustment may eliminate the need for it, and speed up this function.
      final Map<String, Object> mapForPostAggregationComputation = rowWithPostAggregations.toMap(query);
      for (int i = 0; i < query.getPostAggregatorSpecs().size(); i++) {
        final PostAggregator postAggregator = query.getPostAggregatorSpecs().get(i);
        final Object value = postAggregator.compute(mapForPostAggregationComputation);
        rowWithPostAggregations.set(query.getResultRowPostAggregatorStart() + i, value);
        mapForPostAggregationComputation.put(postAggregator.getName(), value);
      }
      return rowWithPostAggregations;
    });
  }
}
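A rough sketch of the row reshaping described in the long comment above, for an outer query with granularity=all, one regular dimension d0, and one TIME_FLOOR dimension d1 that the optimization turns into the inner query's granularity. The array juggling below only illustrates the index movement; it is not the actual moveOrReplicateTimestampInRow implementation, and the literal values are made up:

// Inner (rewritten) query: granularity = DAY, dimensions = [d0]     -> merged row shape [__time, d0]
// Outer (original) query:  granularity = ALL, dimensions = [d0, d1] -> expected row shape [d0, d1]
Object[] innerRow = new Object[]{1609459200000L /* bucketed __time */, "city_1" /* d0 */};
Object[] outerRow = new Object[2];
outerRow[0] = innerRow[1]; // d0 keeps its position among the original dimensions
outerRow[1] = innerRow[0]; // the bucket timestamp moves back to where the removed time dimension (d1) was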