Use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.
From the class HyperUniquesAggregatorFactoryTest, method testCompare1.
@Test
public void testCompare1() throws Exception {
  HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector();
  HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector();
  // fn is the test class's shared HashFunction, defined elsewhere in the test
  collector1.add(fn.hashLong(0).asBytes());
  HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar");
  Comparator comparator = factory.getComparator();
  for (int i = 1; i < 100; i = i + 2) {
    collector1.add(fn.hashLong(i).asBytes());
    collector2.add(fn.hashLong(i + 1).asBytes());
    // collector1 has seen one more distinct value than collector2 on every
    // iteration, so the comparator and the raw estimates agree that it is larger
    Assert.assertEquals(1, comparator.compare(collector1, collector2));
    Assert.assertEquals(1, Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality()));
  }
}
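The test leans on a shared fn hash function that the snippet does not show. Below is a minimal standalone sketch of that setup, assuming Guava's 128-bit Murmur3; the class name HllSetupSketch and the main method are illustrative, not part of Druid.

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.druid.hll.HyperLogLogCollector;

public class HllSetupSketch {
  // assumption: inputs are hashed with 128-bit Murmur3 before being added
  private static final HashFunction fn = Hashing.murmur3_128();

  public static void main(String[] args) {
    HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
    for (long i = 0; i < 1000; i++) {
      collector.add(fn.hashLong(i).asBytes());
    }
    // the estimate should land near 1000, within HLL's expected error
    System.out.printf("estimated cardinality: %.2f%n", collector.estimateCardinality());
  }
}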
Use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.
From the class HyperUniquesAggregatorFactoryTest, method testCompareToShouldBehaveConsistentlyWithEstimatedCardinalitiesEvenInToughCases.
@Test
public void testCompareToShouldBehaveConsistentlyWithEstimatedCardinalitiesEvenInToughCases() throws Exception {
  // given
  Random rand = new Random(0);
  HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar");
  Comparator comparator = factory.getComparator();
  for (int i = 0; i < 1000; ++i) {
    // given: two collectors with similar, randomly chosen cardinalities (5,000 to 13,999)
    HyperLogLogCollector leftCollector = HyperLogLogCollector.makeLatestCollector();
    int j = rand.nextInt(9000) + 5000;
    for (int l = 0; l < j; ++l) {
      leftCollector.add(fn.hashLong(rand.nextLong()).asBytes());
    }
    HyperLogLogCollector rightCollector = HyperLogLogCollector.makeLatestCollector();
    int k = rand.nextInt(9000) + 5000;
    for (int l = 0; l < k; ++l) {
      rightCollector.add(fn.hashLong(rand.nextLong()).asBytes());
    }
    // when
    final int orderedByCardinality = Double.compare(leftCollector.estimateCardinality(), rightCollector.estimateCardinality());
    final int orderedByComparator = comparator.compare(leftCollector, rightCollector);
    // then, assert the hyperloglog comparator behaves consistently with the estimated cardinalities
    Assert.assertEquals(
        String.format(
            "orderedByComparator=%d, orderedByCardinality=%d,\n" + "Left={cardinality=%f, hll=%s},\n" + "Right={cardinality=%f, hll=%s},\n",
            orderedByComparator, orderedByCardinality,
            leftCollector.estimateCardinality(), leftCollector,
            rightCollector.estimateCardinality(), rightCollector
        ),
        orderedByCardinality,
        orderedByComparator
    );
  }
}
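Because the comparator agrees with the estimates, it can also be used to sort collectors directly. A short usage sketch under the same assumptions as the tests above (fn as a 128-bit Murmur3 HashFunction; the variable names are illustrative):

List<HyperLogLogCollector> collectors = new ArrayList<>();
Random rand = new Random(0);
for (int i = 0; i < 10; i++) {
  HyperLogLogCollector c = HyperLogLogCollector.makeLatestCollector();
  int n = rand.nextInt(9000) + 5000;
  for (int j = 0; j < n; j++) {
    c.add(fn.hashLong(rand.nextLong()).asBytes());
  }
  collectors.add(c);
}
// sorting with the factory comparator should yield nondecreasing estimates
Comparator comparator = new HyperUniquesAggregatorFactory("foo", "bar").getComparator();
Collections.sort(collectors, comparator);
for (int i = 1; i < collectors.size(); i++) {
  assert collectors.get(i - 1).estimateCardinality() <= collectors.get(i).estimateCardinality();
}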
Use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.
From the class IndexTask, method determineShardSpecs.
/**
 * Determines the number of shards for each interval using a hash of the queryGranularity timestamp
 * plus all dimensions (i.e., hash-based partitioning). In the future we may want to also support
 * single-dimension partitioning.
 */
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
  final ObjectMapper jsonMapper = toolbox.getObjectMapper();
  final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
  final Granularity queryGranularity = granularitySpec.getQueryGranularity();
  final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
  final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
  final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
  // if we were given the number of shards per interval and the intervals, we don't need to scan the data
  if (!determineNumPartitions && !determineIntervals) {
    log.info("numShards and intervals provided, skipping determine partition scan");
    final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
    final int numShards = ingestionSchema.getTuningConfig().getNumShards();
    for (Interval interval : intervals) {
      final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
      if (numShards > 1) {
        for (int i = 0; i < numShards; i++) {
          intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
        }
      } else {
        intervalShardSpecs.add(NoneShardSpec.instance());
      }
      shardSpecs.put(interval, intervalShardSpecs);
    }
    return shardSpecs;
  }
  // determine intervals containing data and prime HLL collectors
  final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
  int thrownAway = 0;
  log.info("Determining intervals and shardSpecs");
  long determineShardSpecsStartMillis = System.currentTimeMillis();
  try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
    while (firehose.hasMore()) {
      final InputRow inputRow = firehose.nextRow();
      final Interval interval;
      if (determineIntervals) {
        interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
      } else {
        final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
        if (!optInterval.isPresent()) {
          thrownAway++;
          continue;
        }
        interval = optInterval.get();
      }
      if (!determineNumPartitions) {
        // we only need to know that the interval contains data, so add an absent entry
        // for the interval and don't instantiate an HLL collector
        if (!hllCollectors.containsKey(interval)) {
          hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
        }
        continue;
      }
      if (!hllCollectors.containsKey(interval)) {
        hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
      }
      List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
      hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
    }
  }
  if (thrownAway > 0) {
    log.warn("Unable to find a matching interval for [%,d] events", thrownAway);
  }
  final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
  for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
    final Interval interval = entry.getKey();
    final Optional<HyperLogLogCollector> collector = entry.getValue();
    final int numShards;
    if (determineNumPartitions) {
      final long numRows = new Double(collector.get().estimateCardinality()).longValue();
      numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
      log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
    } else {
      numShards = ingestionSchema.getTuningConfig().getNumShards();
      log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
    }
    final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
    if (numShards > 1) {
      for (int i = 0; i < numShards; i++) {
        intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
      }
    } else {
      intervalShardSpecs.add(NoneShardSpec.instance());
    }
    shardSpecs.put(interval, intervalShardSpecs);
  }
  log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
  return shardSpecs;
}
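The shard-count arithmetic at the end reduces to a ceiling division of the HLL cardinality estimate by the target partition size. A hedged sketch of just that step; the targetPartitionSize value and the comment figures are made up for illustration:

// e.g., an estimate of ~4.2M distinct rows against a 5M target yields 1 shard;
// ~12M would yield 3
long numRows = (long) collector.estimateCardinality(); // collector primed during the scan
int targetPartitionSize = 5_000_000; // assumption: value from the tuning config
int numShards = (int) Math.ceil((double) numRows / targetPartitionSize);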
Use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.
From the class HyperUniquesBufferAggregator, method aggregate.
@Override
public void aggregate(ByteBuffer buf, int position) {
  HyperLogLogCollector collector = (HyperLogLogCollector) selector.get();
  if (collector == null) {
    return;
  }
  // Save position and limit and restore them later instead of allocating a new ByteBuffer object
  final int oldPosition = buf.position();
  final int oldLimit = buf.limit();
  buf.limit(position + HyperLogLogCollector.getLatestNumBytesForDenseStorage());
  buf.position(position);
  try {
    // fold the selected collector into the HLL stored at [position, position + dense size)
    HyperLogLogCollector.makeCollector(buf).fold(collector);
  } finally {
    buf.limit(oldLimit);
    buf.position(oldPosition);
  }
}
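The save-and-restore dance around position and limit is a general pattern for mutating one region of a shared buffer without allocating a duplicate view. A standalone sketch of the same pattern; the BufferWindows helper below is illustrative, not Druid code:

import java.nio.ByteBuffer;
import java.util.function.Consumer;

final class BufferWindows {
  // Expose [position, position + length) of buf to the action, then restore
  // the buffer's original position and limit, even if the action throws.
  static void withWindow(ByteBuffer buf, int position, int length, Consumer<ByteBuffer> action) {
    final int oldPosition = buf.position();
    final int oldLimit = buf.limit();
    buf.limit(position + length);
    buf.position(position);
    try {
      action.accept(buf);
    } finally {
      buf.limit(oldLimit);
      buf.position(oldPosition);
    }
  }
}

With such a helper, the body of aggregate() would collapse to a single call: BufferWindows.withWindow(buf, position, HyperLogLogCollector.getLatestNumBytesForDenseStorage(), b -> HyperLogLogCollector.makeCollector(b).fold(collector)).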
Use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.
From the class HyperUniquesSerde, method getObjectStrategy.
@Override
public ObjectStrategy getObjectStrategy() {
  return new ObjectStrategy<HyperLogLogCollector>() {
    @Override
    public Class<? extends HyperLogLogCollector> getClazz() {
      return HyperLogLogCollector.class;
    }

    @Override
    public HyperLogLogCollector fromByteBuffer(ByteBuffer buffer, int numBytes) {
      // read through a read-only view so the source buffer's position and limit are untouched
      final ByteBuffer readOnlyBuffer = buffer.asReadOnlyBuffer();
      readOnlyBuffer.limit(readOnlyBuffer.position() + numBytes);
      return HyperLogLogCollector.makeCollector(readOnlyBuffer);
    }

    @Override
    public byte[] toBytes(HyperLogLogCollector collector) {
      if (collector == null) {
        return new byte[] {};
      }
      ByteBuffer val = collector.toByteBuffer();
      byte[] retVal = new byte[val.remaining()];
      val.asReadOnlyBuffer().get(retVal);
      return retVal;
    }

    @Override
    public int compare(HyperLogLogCollector o1, HyperLogLogCollector o2) {
      // comparator is the serde's collector comparator, defined elsewhere in the class
      return comparator.compare(o1, o2);
    }
  };
}
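A quick round trip through an ObjectStrategy like this one shows that serialization preserves the estimate exactly, since toBytes copies the collector's registers verbatim. In the sketch below, strategy is assumed to be the ObjectStrategy<HyperLogLogCollector> returned above, and the Murmur3 hash setup is an assumption for illustration:

// assuming "strategy" is the ObjectStrategy<HyperLogLogCollector> returned above
HyperLogLogCollector original = HyperLogLogCollector.makeLatestCollector();
original.add(Hashing.murmur3_128().hashLong(42L).asBytes());

byte[] bytes = strategy.toBytes(original);
HyperLogLogCollector copy = strategy.fromByteBuffer(ByteBuffer.wrap(bytes), bytes.length);

// the registers were copied byte for byte, so the estimates match exactly
assert Double.compare(original.estimateCardinality(), copy.estimateCardinality()) == 0;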