Use of io.crate.statistics.Stats in project crate by crate.
Class SelectivityFunctionsTest, method test_eq_not_in_mcv_is_based_on_approx_distinct:
/**
 * Checks that {@code SelectivityFunctions.estimateNumRows} yields exactly one row for
 * {@code x = 10} when column {@code x} is described by stats built from the sorted values
 * 1..20_000 and the table stats report 20_000 rows.
 */
@Test
public void test_eq_not_in_mcv_is_based_on_approx_distinct() {
    // Column stats for "x": every integer from 1 to 20_000 appears once.
    // NOTE(review): the third argument (0) is presumably the null count - confirm against ColumnStats.
    var sortedValues = IntStream.range(1, 20_001).boxed().collect(Collectors.toList());
    var xStats = ColumnStats.fromSortedValues(sortedValues, DataTypes.INTEGER, 0, 20_000L);

    var columnStatsByIdent = new HashMap<ColumnIdent, ColumnStats>();
    columnStatsByIdent.put(new ColumnIdent("x"), xStats);
    Stats tableStatistics = new Stats(20_000, 16, columnStatsByIdent);

    // Build the predicate under test
    SqlExpressions sqlExpressions = new SqlExpressions(T3.sources(clusterService));
    Symbol eqQuery = sqlExpressions.asSymbol("x = 10");

    long estimatedRows = SelectivityFunctions.estimateNumRows(tableStatistics, eqQuery, null);
    assertThat(estimatedRows, Matchers.is(1L));
}
Use of io.crate.statistics.Stats in project crate by crate.
Class GroupHashAggregate, method approximateDistinctValues:
/**
 * Estimates the number of distinct groups produced by grouping {@code numSourceRows} rows
 * by {@code groupKeys}.
 *
 * <p>For every group key the per-key estimate is multiplied into a running product. The
 * product is accumulated with saturating arithmetic so that many keys combined with a large
 * {@code numSourceRows} cannot wrap around into a negative or otherwise bogus estimate.
 *
 * @param numSourceRows estimated number of rows delivered by the source operator
 * @param tableStats    provider of the per-table statistics used to look up column stats
 * @param groupKeys     the symbols the rows are grouped by
 * @return {@code min(product of per-key estimates, numSourceRows)} if every group key is a
 *         {@code Reference} or {@code ScopedSymbol}; otherwise {@code numSourceRows}
 */
static long approximateDistinctValues(long numSourceRows, TableStats tableStats, List<Symbol> groupKeys) {
    long distinctValues = 1;
    // Counts the keys for which a stats lookup was possible (Reference / ScopedSymbol),
    // regardless of whether column stats were actually found for them.
    int numKeysWithStats = 0;
    for (Symbol groupKey : groupKeys) {
        Stats stats = null;
        ColumnStats columnStats = null;
        if (groupKey instanceof Reference) {
            Reference ref = (Reference) groupKey;
            stats = tableStats.getStats(ref.ident().tableIdent());
            columnStats = stats.statsByColumn().get(ref.column());
            numKeysWithStats++;
        } else if (groupKey instanceof ScopedSymbol) {
            ScopedSymbol scopedSymbol = (ScopedSymbol) groupKey;
            stats = tableStats.getStats(scopedSymbol.relation());
            columnStats = stats.statsByColumn().get(scopedSymbol.column());
            numKeysWithStats++;
        }
        if (columnStats == null) {
            // Assume worst case: Every value is unique
            distinctValues = saturatedMultiply(distinctValues, numSourceRows);
        } else {
            // `approxDistinct` is the number of distinct values in relation to `stats.numDocs()`, not in
            // relation to `numSourceRows`, which is based on the estimates of a source operator.
            // That is why we calculate the cardinality ratio and calculate the new distinct
            // values based on `numSourceRows` to account for changes in the number of rows in source operators
            //
            // e.g. SELECT x, count(*) FROM tbl GROUP BY x
            // and  SELECT x, count(*) FROM tbl WHERE pk = 1 GROUP BY x
            //
            // have a different number of groups
            double cardinalityRatio = columnStats.approxDistinct() / stats.numDocs();
            distinctValues = saturatedMultiply(distinctValues, (long) (numSourceRows * cardinalityRatio));
        }
    }
    if (numKeysWithStats == groupKeys.size()) {
        return Math.min(distinctValues, numSourceRows);
    } else {
        return numSourceRows;
    }
}

/**
 * Multiplies two longs, clamping the result to {@link Long#MAX_VALUE} or
 * {@link Long#MIN_VALUE} instead of silently wrapping around on overflow.
 */
private static long saturatedMultiply(long a, long b) {
    long low = a * b;
    long high = Math.multiplyHigh(a, b);
    // The full 128-bit product fits into a long iff its upper 64 bits are
    // exactly the sign extension of its lower 64 bits.
    if (high == (low >> 63)) {
        return low;
    }
    // The sign of the upper half is the sign of the true product.
    return high < 0 ? Long.MIN_VALUE : Long.MAX_VALUE;
}
Use of io.crate.statistics.Stats in project crate by crate.
Class MergeFilterAndCollect, method apply:
/**
 * Merges the filter's query into the captured {@code Collect}'s where clause and returns a
 * new {@code Collect} whose expected row count is re-estimated from the table stats for the
 * combined query.
 */
@Override
public LogicalPlan apply(Filter filter, Captures captures, TableStats tableStats, TransactionContext txnCtx, NodeContext nodeCtx) {
    Collect source = captures.get(collectCapture);
    Stats relationStats = tableStats.getStats(source.relation().tableInfo().ident());

    // Combine the existing where clause with the filter's condition
    WhereClause mergedWhere = source.where().add(filter.query());

    return new Collect(
        source.relation(),
        source.outputs(),
        mergedWhere,
        SelectivityFunctions.estimateNumRows(relationStats, mergedWhere.queryOrFallback(), null),
        relationStats.averageSizePerRowInBytes()
    );
}
Use of io.crate.statistics.Stats in project crate by crate.
Class GroupHashAggregateTest, method setUpStatsAndExpressions:
/**
 * Prepares the fixtures: table stats for {@code doc.t1} (2_000 rows) in which the columns
 * {@code x} and {@code i} share the same column stats, plus the SQL expression helper.
 */
@Before
public void setUpStatsAndExpressions() throws Exception {
    long numDocs = 2_000L;

    // 100 samples in sorted order: fifty 10s followed by fifty 20s
    IntStream tens = IntStream.generate(() -> 10).limit(50);
    IntStream twenties = IntStream.generate(() -> 20).limit(50);
    var sortedSamples = IntStream.concat(tens, twenties)
        .boxed()
        .collect(Collectors.toList());

    ColumnStats<Integer> sharedColumnStats =
        ColumnStats.fromSortedValues(sortedSamples, DataTypes.INTEGER, 0, numDocs);
    Stats t1Stats = new Stats(
        numDocs,
        DataTypes.INTEGER.fixedSize(),
        Map.of(
            new ColumnIdent("x"), sharedColumnStats,
            new ColumnIdent("i"), sharedColumnStats
        )
    );

    tableStats = new TableStats();
    tableStats.updateTableStats(Map.of(new RelationName("doc", "t1"), t1Stats));
    expressions = new SqlExpressions(T3.sources(clusterService));
}
Use of io.crate.statistics.Stats in project crate by crate.
Class SelectivityFunctionsCalculationTest, method test_group_operator_adapt_expected_row_count_based_on_column_stats:
/**
 * Plans {@code select x, count(*) from doc.tbl group by x} against table stats whose column
 * {@code x} was sampled with only the values 10 and 20, and expects the plan to report
 * two expected rows.
 */
@Test
public void test_group_operator_adapt_expected_row_count_based_on_column_stats() throws Throwable {
    long numDocs = 2_000L;

    // 100 samples in sorted order: fifty 10s followed by fifty 20s
    IntStream tens = IntStream.generate(() -> 10).limit(50);
    IntStream twenties = IntStream.generate(() -> 20).limit(50);
    var sortedSamples = IntStream.concat(tens, twenties)
        .boxed()
        .collect(Collectors.toList());

    Stats tblStats = new Stats(
        numDocs,
        DataTypes.INTEGER.fixedSize(),
        Map.of(new ColumnIdent("x"), ColumnStats.fromSortedValues(sortedSamples, DataTypes.INTEGER, 0, numDocs))
    );
    TableStats statsRegistry = new TableStats();
    statsRegistry.updateTableStats(Map.of(new RelationName("doc", "tbl"), tblStats));

    SQLExecutor executor = SQLExecutor.builder(clusterService)
        .setTableStats(statsRegistry)
        .addTable("create table doc.tbl (x int)")
        .build();

    LogicalPlan groupByPlan = executor.logicalPlan("select x, count(*) from doc.tbl group by x");
    assertThat(groupByPlan.numExpectedRows(), Matchers.is(2L));
}
Aggregations