use of io.crate.statistics.ColumnStats in project crate by crate.
the class GroupHashAggregate method approximateDistinctValues.
/**
 * Estimates the number of distinct groups produced by grouping {@code numSourceRows} rows
 * on {@code groupKeys}.
 *
 * <p>Each key that is a {@code Reference} or a {@code ScopedSymbol} is looked up in
 * {@code tableStats}. If column statistics are available, the key contributes
 * {@code numSourceRows * (approxDistinct / numDocs)} distinct values; otherwise it contributes
 * {@code numSourceRows} (worst case). The per-key contributions are multiplied using a
 * saturating multiplication, so a large number of rows combined with several keys cannot
 * wrap around {@code long} and yield a bogus (possibly negative) estimate.
 *
 * @param numSourceRows estimated number of rows delivered by the source operator
 * @param tableStats    statistics used to look up per-column cardinalities
 * @param groupKeys     the symbols the rows are grouped by
 * @return {@code min(product of per-key estimates, numSourceRows)} if every key is a
 *         {@code Reference} or {@code ScopedSymbol}; otherwise {@code numSourceRows}
 */
static long approximateDistinctValues(long numSourceRows, TableStats tableStats, List<Symbol> groupKeys) {
    long distinctValues = 1;
    int numKeysWithStats = 0;
    for (Symbol groupKey : groupKeys) {
        Stats stats = null;
        ColumnStats<?> columnStats = null;
        if (groupKey instanceof Reference) {
            Reference ref = (Reference) groupKey;
            stats = tableStats.getStats(ref.ident().tableIdent());
            columnStats = stats.statsByColumn().get(ref.column());
            numKeysWithStats++;
        } else if (groupKey instanceof ScopedSymbol) {
            ScopedSymbol scopedSymbol = (ScopedSymbol) groupKey;
            stats = tableStats.getStats(scopedSymbol.relation());
            columnStats = stats.statsByColumn().get(scopedSymbol.column());
            numKeysWithStats++;
        }

        long distinctValuesOfKey;
        if (columnStats == null) {
            // Assume worst case: Every value is unique
            distinctValuesOfKey = numSourceRows;
        } else {
            // `approxDistinct` is the number of distinct values in relation to `stats.numDocs()`, not in
            // relation to `numSourceRows`, which is based on the estimates of a source operator.
            // That is why we calculate the cardinality ratio and calculate the new distinct
            // values based on `numSourceRows` to account for changes in the number of rows in source operators
            //
            // e.g. SELECT x, count(*) FROM tbl GROUP BY x
            // and  SELECT x, count(*) FROM tbl WHERE pk = 1 GROUP BY x
            //
            // have a different number of groups
            double cardinalityRatio = columnStats.approxDistinct() / stats.numDocs();
            distinctValuesOfKey = (long) (numSourceRows * cardinalityRatio);
        }
        // A plain `*=` silently wraps around once the product exceeds Long.MAX_VALUE,
        // which happens quickly with multiple keys on large tables.
        distinctValues = saturatedMultiply(distinctValues, distinctValuesOfKey);
    }
    if (numKeysWithStats == groupKeys.size()) {
        return Math.min(distinctValues, numSourceRows);
    } else {
        return numSourceRows;
    }
}

/**
 * Multiplies two longs, clamping the result to {@link Long#MAX_VALUE} or
 * {@link Long#MIN_VALUE} instead of wrapping around on overflow.
 */
private static long saturatedMultiply(long a, long b) {
    long high = Math.multiplyHigh(a, b);
    long low = a * b;
    // The 128-bit product fits into a long iff the high word is the sign extension of the low word
    if ((high == 0 && low >= 0) || (high == -1 && low < 0)) {
        return low;
    }
    return high < 0 ? Long.MIN_VALUE : Long.MAX_VALUE;
}
use of io.crate.statistics.ColumnStats in project crate by crate.
the class GroupHashAggregateTest method setUpStatsAndExpressions.
/**
 * Builds the fixtures used by the tests: statistics for the table {@code doc.t1}
 * (2000 docs, columns {@code x} and {@code i}) and the expressions helper based on the T3 sources.
 */
@Before
public void setUpStatsAndExpressions() throws Exception {
    long numDocs = 2_000L;

    // 100 sorted sample values: fifty 10s followed by fifty 20s
    var tens = IntStream.generate(() -> 10).limit(50);
    var twenties = IntStream.generate(() -> 20).limit(50);
    var sortedSamples = IntStream.concat(tens, twenties)
        .boxed()
        .collect(Collectors.toList());
    ColumnStats<Integer> sharedColumnStats =
        ColumnStats.fromSortedValues(sortedSamples, DataTypes.INTEGER, 0, numDocs);

    // Both columns are registered with the same column statistics instance
    Stats t1Stats = new Stats(
        numDocs,
        DataTypes.INTEGER.fixedSize(),
        Map.of(new ColumnIdent("x"), sharedColumnStats, new ColumnIdent("i"), sharedColumnStats)
    );

    tableStats = new TableStats();
    tableStats.updateTableStats(Map.of(new RelationName("doc", "t1"), t1Stats));
    expressions = new SqlExpressions(T3.sources(clusterService));
}
use of io.crate.statistics.ColumnStats in project crate by crate.
the class SelectivityFunctionsTest method test_col_is_null_uses_null_fraction_as_selectivity.
/**
 * Checks that {@code x is null} is estimated at 50 of 100 rows when the column
 * statistics of {@code x} report a null fraction of 0.5.
 */
@Test
public void test_col_is_null_uses_null_fraction_as_selectivity() {
    SqlExpressions sqlExpressions = new SqlExpressions(T3.sources(clusterService));
    Symbol isNullQuery = sqlExpressions.asSymbol("x is null");

    var xStats = ColumnStats.fromSortedValues(List.of(1, 2), DataTypes.INTEGER, 2, 4);
    assertThat(xStats.nullFraction(), Matchers.is(0.5));

    Stats tableStatistics = new Stats(100, 16, Map.of(new ColumnIdent("x"), xStats));
    var estimatedRows = SelectivityFunctions.estimateNumRows(tableStatistics, isNullQuery, null);
    assertThat(estimatedRows, Matchers.is(50L));
}
use of io.crate.statistics.ColumnStats in project crate by crate.
the class SelectivityFunctionsTest method test_eq_null_value_is_always_0.
/**
 * Checks that {@code x = null} is estimated to match 0 rows, given statistics for
 * {@code x} built from the values 1..49 on a table of 20000 docs.
 */
@Test
public void test_eq_null_value_is_always_0() {
    SqlExpressions sqlExpressions = new SqlExpressions(T3.sources(clusterService));
    Symbol eqNullQuery = sqlExpressions.asSymbol("x = null");

    // Sorted sample values 1..49
    var sortedValues = IntStream.rangeClosed(1, 49)
        .boxed()
        .collect(Collectors.toList());
    var xStats = ColumnStats.fromSortedValues(sortedValues, DataTypes.INTEGER, 0, 20_000L);

    var columnStatsByIdent = new HashMap<ColumnIdent, ColumnStats>();
    columnStatsByIdent.put(new ColumnIdent("x"), xStats);
    Stats tableStatistics = new Stats(20_000, 16, columnStatsByIdent);

    var estimatedRows = SelectivityFunctions.estimateNumRows(tableStatistics, eqNullQuery, null);
    assertThat(estimatedRows, Matchers.is(0L));
}
Aggregations