Use of org.apache.datasketches.hll.HllSketch in project druid by druid-io.
From the class HllSketchMergeVectorAggregator, the aggregate method:
@Override
public void aggregate(
    final ByteBuffer buf,
    final int numRows,
    final int[] positions,
    @Nullable final int[] rows,
    final int positionOffset
)
{
  final Object[] vector = objectSupplier.get();
  for (int i = 0; i < numRows; i++) {
    final HllSketch o = (HllSketch) vector[rows != null ? rows[i] : i];
    if (o != null) {
      final int position = positions[i] + positionOffset;
      final WritableMemory mem = WritableMemory.writableWrap(buf, ByteOrder.LITTLE_ENDIAN)
                                               .writableRegion(position, helper.getSize());
      final Union union = Union.writableWrap(mem);
      union.update(o);
    }
  }
}
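The aggregator merges each non-null input sketch into a Union that lives directly in the query's merge buffer, so nothing is materialized on the heap per row. Below is a minimal standalone sketch of the same buffer-resident pattern; the class name, the lgK of 12, the sizing via HllSketch.getMaxUpdatableSerializationBytes, and the explicit initialization step are assumptions standing in for what Druid's helper computes elsewhere.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.TgtHllType;
import org.apache.datasketches.hll.Union;
import org.apache.datasketches.memory.WritableMemory;

public class OffHeapUnionSketch
{
  public static void main(String[] args)
  {
    final int lgK = 12;  // hypothetical lgK; in Druid it comes from the aggregator factory
    // A Union's internal state is an HLL_8 sketch, so size the region for an updatable HLL_8 image.
    final int size = HllSketch.getMaxUpdatableSerializationBytes(lgK, TgtHllType.HLL_8);
    final ByteBuffer buf = ByteBuffer.allocateDirect(size);

    // Write an empty union image into the buffer region (Druid's init() step does this per position).
    final WritableMemory mem = WritableMemory.writableWrap(buf, ByteOrder.LITTLE_ENDIAN).writableRegion(0, size);
    new Union(lgK, mem);

    // Merge an incoming heap sketch into the buffer-resident union, as aggregate() does per row.
    final HllSketch incoming = new HllSketch(lgK);
    incoming.update("some-dimension-value");
    Union.writableWrap(mem).update(incoming);

    System.out.println(Union.writableWrap(mem).getResult().getEstimate());  // ~1.0
  }
}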
Use of org.apache.datasketches.hll.HllSketch in project druid by druid-io.
From the class HllSketchUnionPostAggregator, the compute method:
@Override
public HllSketch compute(final Map<String, Object> combinedAggregators)
{
  final Union union = new Union(lgK);
  for (final PostAggregator field : fields) {
    final HllSketch sketch = (HllSketch) field.compute(combinedAggregators);
    union.update(sketch);
  }
  return union.getResult(tgtHllType);
}
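The post-aggregator unions the sketches produced by its input fields on the heap and converts the result to the configured target HLL type. A minimal sketch of that flow is below; the lgK of 12 and the HLL_4 target type are illustrative stand-ins for values that Druid reads from the post-aggregator spec.

import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.TgtHllType;
import org.apache.datasketches.hll.Union;

public class PostAggUnionSketch
{
  public static void main(String[] args)
  {
    final int lgK = 12;  // illustrative; taken from the post-aggregator spec in Druid
    final HllSketch a = new HllSketch(lgK);
    final HllSketch b = new HllSketch(lgK);
    for (long i = 0; i < 1000; i++) { a.update(i); }
    for (long i = 500; i < 1500; i++) { b.update(i); }

    final Union union = new Union(lgK);
    union.update(a);
    union.update(b);

    // getResult copies the union's state into a sketch of the requested target type.
    final HllSketch merged = union.getResult(TgtHllType.HLL_4);
    System.out.printf("estimate = %.1f%n", merged.getEstimate());  // ~1500
  }
}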
Use of org.apache.datasketches.hll.HllSketch in project druid by druid-io.
From the class ParallelIndexSupervisorTask, the mergeCardinalityReports method:
private static Map<Interval, Union> mergeCardinalityReports(Collection<DimensionCardinalityReport> reports)
{
  Map<Interval, Union> finalCollectors = new HashMap<>();
  reports.forEach(report -> {
    Map<Interval, byte[]> intervalToCardinality = report.getIntervalToCardinalities();
    for (Map.Entry<Interval, byte[]> entry : intervalToCardinality.entrySet()) {
      HllSketch entryHll = HllSketch.wrap(Memory.wrap(entry.getValue()));
      finalCollectors
          .computeIfAbsent(entry.getKey(), k -> new Union(DimensionCardinalityReport.HLL_SKETCH_LOG_K))
          .update(entryHll);
    }
  });
  return finalCollectors;
}
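The supervisor wraps each serialized sketch from the subtask reports without copying it onto the heap and folds it into a per-interval Union. A standalone sketch of that wrap-and-union step is below; the lgK of 11 and the sketchOf helper are illustrative (Druid uses the constant DimensionCardinalityReport.HLL_SKETCH_LOG_K).

import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.Union;
import org.apache.datasketches.memory.Memory;

public class MergeSerializedSketches
{
  public static void main(String[] args)
  {
    final int lgK = 11;  // illustrative stand-in for DimensionCardinalityReport.HLL_SKETCH_LOG_K
    // Pretend these byte arrays arrived in cardinality reports from two subtasks.
    final byte[] fromTaskA = sketchOf(lgK, 0, 1000);
    final byte[] fromTaskB = sketchOf(lgK, 500, 1500);

    final Union union = new Union(lgK);
    // wrap() reads the compact image in place instead of deserializing it onto the heap.
    union.update(HllSketch.wrap(Memory.wrap(fromTaskA)));
    union.update(HllSketch.wrap(Memory.wrap(fromTaskB)));
    System.out.println(union.getResult().getEstimate());  // ~1500
  }

  private static byte[] sketchOf(int lgK, int from, int to)
  {
    final HllSketch sketch = new HllSketch(lgK);
    for (long i = from; i < to; i++) { sketch.update(i); }
    return sketch.toCompactByteArray();
  }
}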
Use of org.apache.datasketches.hll.HllSketch in project druid by druid-io.
From the class PartialDimensionCardinalityTask, the determineCardinalities method:
private Map<Interval, byte[]> determineCardinalities(
    CloseableIterator<InputRow> inputRowIterator,
    GranularitySpec granularitySpec
)
{
  Map<Interval, HllSketch> intervalToCardinalities = new HashMap<>();
  while (inputRowIterator.hasNext()) {
    InputRow inputRow = inputRowIterator.next();
    // null rows are filtered out by FilteringCloseableInputRowIterator
    DateTime timestamp = inputRow.getTimestamp();
    final Interval interval;
    if (granularitySpec.inputIntervals().isEmpty()) {
      interval = granularitySpec.getSegmentGranularity().bucket(timestamp);
    } else {
      final Optional<Interval> optInterval = granularitySpec.bucketInterval(timestamp);
      // this interval must exist since it passed the rowFilter
      assert optInterval.isPresent();
      interval = optInterval.get();
    }
    Granularity queryGranularity = granularitySpec.getQueryGranularity();
    HllSketch hllSketch = intervalToCardinalities.computeIfAbsent(
        interval,
        (intervalKey) -> DimensionCardinalityReport.createHllSketchForReport()
    );
    // For cardinality estimation, we want to consider unique rows instead of unique hash buckets,
    // and therefore we do not use partition dimensions in computing the group key.
    List<Object> groupKey = HashPartitioner.extractKeys(
        Collections.emptyList(),
        queryGranularity.bucketStart(timestamp).getMillis(),
        inputRow
    );
    try {
      hllSketch.update(jsonMapper.writeValueAsBytes(groupKey));
    } catch (JsonProcessingException jpe) {
      throw new RuntimeException(jpe);
    }
  }
  // Serialize the collectors for sending to the supervisor task
  Map<Interval, byte[]> newMap = new HashMap<>();
  for (Map.Entry<Interval, HllSketch> entry : intervalToCardinalities.entrySet()) {
    newMap.put(entry.getKey(), entry.getValue().toCompactByteArray());
  }
  return newMap;
}
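The task keeps one HllSketch per interval and feeds it the JSON-serialized group key of every row, so duplicate rows collapse into a single distinct value before the compact image is shipped to the supervisor. A minimal sketch of that update path is below; the hand-rolled group keys stand in for HashPartitioner.extractKeys, and the lgK of 11 is illustrative.

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Arrays;
import java.util.List;
import org.apache.datasketches.hll.HllSketch;

public class GroupKeyCardinality
{
  public static void main(String[] args) throws JsonProcessingException
  {
    final ObjectMapper jsonMapper = new ObjectMapper();
    final HllSketch sketch = new HllSketch(11);  // illustrative lgK

    // Each group key is (bucketed timestamp millis, dimension values); duplicates collapse in the sketch.
    final List<Object> row1 = Arrays.asList(0L, "us", "chrome");
    final List<Object> row2 = Arrays.asList(0L, "us", "chrome");  // duplicate row
    final List<Object> row3 = Arrays.asList(0L, "de", "firefox");
    for (List<Object> groupKey : Arrays.asList(row1, row2, row3)) {
      sketch.update(jsonMapper.writeValueAsBytes(groupKey));
    }
    System.out.println(sketch.getEstimate());  // ~2
  }
}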
Use of org.apache.datasketches.hll.HllSketch in project druid by druid-io.
From the class DimensionCardinalityReportTest, the testSupervisorDetermineNumShardsFromCardinalityReport method:
@Test
public void testSupervisorDetermineNumShardsFromCardinalityReport()
{
  List<DimensionCardinalityReport> reports = new ArrayList<>();

  HllSketch collector1 = DimensionCardinalityReport.createHllSketchForReport();
  collector1.update(IndexTask.HASH_FUNCTION.hashLong(1L).asBytes());
  collector1.update(IndexTask.HASH_FUNCTION.hashLong(200L).asBytes());
  DimensionCardinalityReport report1 = new DimensionCardinalityReport(
      "taskA",
      ImmutableMap.of(Intervals.of("1970-01-01T00:00:00.000Z/1970-01-02T00:00:00.000Z"), collector1.toCompactByteArray())
  );
  reports.add(report1);

  HllSketch collector2 = DimensionCardinalityReport.createHllSketchForReport();
  collector2.update(IndexTask.HASH_FUNCTION.hashLong(1000L).asBytes());
  collector2.update(IndexTask.HASH_FUNCTION.hashLong(30000L).asBytes());
  DimensionCardinalityReport report2 = new DimensionCardinalityReport(
      "taskB",
      ImmutableMap.of(Intervals.of("1970-01-01T00:00:00.000Z/1970-01-02T00:00:00.000Z"), collector2.toCompactByteArray())
  );
  reports.add(report2);

  // Separate interval with only 1 value
  HllSketch collector3 = DimensionCardinalityReport.createHllSketchForReport();
  collector3.update(IndexTask.HASH_FUNCTION.hashLong(99000L).asBytes());
  DimensionCardinalityReport report3 = new DimensionCardinalityReport(
      "taskC",
      ImmutableMap.of(Intervals.of("1970-01-02T00:00:00.000Z/1970-01-03T00:00:00.000Z"), collector3.toCompactByteArray())
  );
  reports.add(report3);

  // first interval in test has cardinality 4
  Map<Interval, Integer> intervalToNumShards =
      ParallelIndexSupervisorTask.determineNumShardsFromCardinalityReport(reports, 1);
  Assert.assertEquals(ImmutableMap.of(Intervals.of("1970-01-01/P1D"), 4, Intervals.of("1970-01-02/P1D"), 1), intervalToNumShards);

  intervalToNumShards = ParallelIndexSupervisorTask.determineNumShardsFromCardinalityReport(reports, 2);
  Assert.assertEquals(ImmutableMap.of(Intervals.of("1970-01-01/P1D"), 2, Intervals.of("1970-01-02/P1D"), 1), intervalToNumShards);

  intervalToNumShards = ParallelIndexSupervisorTask.determineNumShardsFromCardinalityReport(reports, 3);
  Assert.assertEquals(ImmutableMap.of(Intervals.of("1970-01-01/P1D"), 1, Intervals.of("1970-01-02/P1D"), 1), intervalToNumShards);

  intervalToNumShards = ParallelIndexSupervisorTask.determineNumShardsFromCardinalityReport(reports, 4);
  Assert.assertEquals(ImmutableMap.of(Intervals.of("1970-01-01/P1D"), 1, Intervals.of("1970-01-02/P1D"), 1), intervalToNumShards);

  intervalToNumShards = ParallelIndexSupervisorTask.determineNumShardsFromCardinalityReport(reports, 5);
  Assert.assertEquals(ImmutableMap.of(Intervals.of("1970-01-01/P1D"), 1, Intervals.of("1970-01-02/P1D"), 1), intervalToNumShards);
}
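The test exercises the full round trip: two subtask sketches over the same day are serialized, merged by the supervisor, and the merged estimate of 4 distinct hashes drives the per-interval shard count for each tested limit. Below is a standalone sketch verifying that the merged first-interval estimate is indeed about 4; murmur3_128 is assumed here as a stand-in for IndexTask.HASH_FUNCTION, and the lgK of 11 is illustrative.

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.Union;

public class FirstIntervalCardinality
{
  public static void main(String[] args)
  {
    final HashFunction hashFunction = Hashing.murmur3_128();  // assumed stand-in for IndexTask.HASH_FUNCTION
    final int lgK = 11;                                       // illustrative lgK

    final HllSketch taskA = new HllSketch(lgK);
    taskA.update(hashFunction.hashLong(1L).asBytes());
    taskA.update(hashFunction.hashLong(200L).asBytes());

    final HllSketch taskB = new HllSketch(lgK);
    taskB.update(hashFunction.hashLong(1000L).asBytes());
    taskB.update(hashFunction.hashLong(30000L).asBytes());

    final Union union = new Union(lgK);
    union.update(taskA);
    union.update(taskB);
    System.out.println(Math.round(union.getResult().getEstimate()));  // 4 distinct hashes
  }
}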