Example 6 with HyperLogLogCollector

use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.

the class HyperUniquesAggregatorFactoryTest method testCompare1.

@Test
public void testCompare1() throws Exception {
    // `fn` is the test class's shared Guava HashFunction (murmur3_128)
    HyperLogLogCollector collector1 = HyperLogLogCollector.makeLatestCollector();
    HyperLogLogCollector collector2 = HyperLogLogCollector.makeLatestCollector();
    collector1.add(fn.hashLong(0).asBytes());
    HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar");
    Comparator comparator = factory.getComparator();
    // collector1 always holds one more distinct hash than collector2, so both
    // the comparator and the cardinality estimates must order it as greater
    for (int i = 1; i < 100; i = i + 2) {
        collector1.add(fn.hashLong(i).asBytes());
        collector2.add(fn.hashLong(i + 1).asBytes());
        Assert.assertEquals(1, comparator.compare(collector1, collector2));
        Assert.assertEquals(1, Double.compare(collector1.estimateCardinality(), collector2.estimateCardinality()));
    }
}
Also used : HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) Comparator(java.util.Comparator) Test(org.junit.Test)
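
The test above relies on a HashFunction field (fn) defined on the test class. As a minimal, self-contained sketch of the collector API it exercises, assuming Guava's murmur3_128 as the hash function (the class and variable names here are illustrative):

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.druid.hll.HyperLogLogCollector;

public class HllCollectorSketch {
    public static void main(String[] args) {
        final HashFunction fn = Hashing.murmur3_128();
        final HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
        // The collector only ever sees hashed inputs, never raw values.
        for (long i = 0; i < 10_000; i++) {
            collector.add(fn.hashLong(i).asBytes());
        }
        // Prints an estimate close to 10000, within the usual HLL error bounds.
        System.out.println(collector.estimateCardinality());
    }
}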

Example 7 with HyperLogLogCollector

use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.

the class HyperUniquesAggregatorFactoryTest method testCompareToShouldBehaveConsistentlyWithEstimatedCardinalitiesEvenInToughCases.

@Test
public void testCompareToShouldBehaveConsistentlyWithEstimatedCardinalitiesEvenInToughCases() throws Exception {
    // given
    Random rand = new Random(0);
    HyperUniquesAggregatorFactory factory = new HyperUniquesAggregatorFactory("foo", "bar");
    Comparator comparator = factory.getComparator();
    for (int i = 0; i < 1000; ++i) {
        // given
        HyperLogLogCollector leftCollector = HyperLogLogCollector.makeLatestCollector();
        int j = rand.nextInt(9000) + 5000;
        for (int l = 0; l < j; ++l) {
            leftCollector.add(fn.hashLong(rand.nextLong()).asBytes());
        }
        HyperLogLogCollector rightCollector = HyperLogLogCollector.makeLatestCollector();
        int k = rand.nextInt(9000) + 5000;
        for (int l = 0; l < k; ++l) {
            rightCollector.add(fn.hashLong(rand.nextLong()).asBytes());
        }
        // when
        final int orderedByCardinality = Double.compare(leftCollector.estimateCardinality(), rightCollector.estimateCardinality());
        final int orderedByComparator = comparator.compare(leftCollector, rightCollector);
        // then, assert hyperloglog comparator behaves consistently with estimated cardinalities
        Assert.assertEquals(
            String.format("orderedByComparator=%d, orderedByCardinality=%d,\n"
                          + "Left={cardinality=%f, hll=%s},\n"
                          + "Right={cardinality=%f, hll=%s},\n",
                          orderedByComparator, orderedByCardinality,
                          leftCollector.estimateCardinality(), leftCollector,
                          rightCollector.estimateCardinality(), rightCollector),
            orderedByCardinality,
            orderedByComparator);
    }
}
Also used : Random(java.util.Random) HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) Comparator(java.util.Comparator) Test(org.junit.Test)
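
Beyond comparison, two collectors built from disjoint streams can be merged with fold, which is what makes the HLL sketch useful for distributed aggregation. A hedged fragment, reusing the imports from the sketch after Example 6:

HashFunction fn = Hashing.murmur3_128();
HyperLogLogCollector left = HyperLogLogCollector.makeLatestCollector();
HyperLogLogCollector right = HyperLogLogCollector.makeLatestCollector();
for (long i = 0; i < 5000; i++) {
    left.add(fn.hashLong(i).asBytes());
    right.add(fn.hashLong(i + 5000).asBytes());
}
// fold merges right's registers into left; the estimate should approach 10000.
left.fold(right);
System.out.println(left.estimateCardinality());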

Example 8 with HyperLogLogCollector

use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.

the class IndexTask method determineShardSpecs.

/**
   * Determines the number of shards for each interval using a hash of the queryGranularity timestamp plus all
   * dimensions (i.e., hash-based partitioning). In the future we may want to also support single-dimension partitioning.
   */
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
    final ObjectMapper jsonMapper = toolbox.getObjectMapper();
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final Granularity queryGranularity = granularitySpec.getQueryGranularity();
    final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
    final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
    final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
    // if we were given number of shards per interval and the intervals, we don't need to scan the data
    if (!determineNumPartitions && !determineIntervals) {
        log.info("numShards and intervals provided, skipping determine partition scan");
        final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
        final int numShards = ingestionSchema.getTuningConfig().getNumShards();
        for (Interval interval : intervals) {
            final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
            if (numShards > 1) {
                for (int i = 0; i < numShards; i++) {
                    intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
                }
            } else {
                intervalShardSpecs.add(NoneShardSpec.instance());
            }
            shardSpecs.put(interval, intervalShardSpecs);
        }
        return shardSpecs;
    }
    // determine intervals containing data and prime HLL collectors
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
    int thrownAway = 0;
    log.info("Determining intervals and shardSpecs");
    long determineShardSpecsStartMillis = System.currentTimeMillis();
    try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            final Interval interval;
            if (determineIntervals) {
                interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
            } else {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                if (!optInterval.isPresent()) {
                    thrownAway++;
                    continue;
                }
                interval = optInterval.get();
            }
            if (!determineNumPartitions) {
                // we don't need to determine numPartitions here, but we still need to determine intervals,
                // so mark the interval with an Optional.absent() and don't instantiate a HLL collector
                if (!hllCollectors.containsKey(interval)) {
                    hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
                }
                continue;
            }
            if (!hllCollectors.containsKey(interval)) {
                hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
            }
            List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
            hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
        }
    }
    if (thrownAway > 0) {
        log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
    }
    final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
    for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
        final Interval interval = entry.getKey();
        final Optional<HyperLogLogCollector> collector = entry.getValue();
        final int numShards;
        if (determineNumPartitions) {
            final long numRows = (long) collector.get().estimateCardinality();
            numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
            log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
        } else {
            numShards = ingestionSchema.getTuningConfig().getNumShards();
            log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
        }
        final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
        if (numShards > 1) {
            for (int i = 0; i < numShards; i++) {
                intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
            }
        } else {
            intervalShardSpecs.add(NoneShardSpec.instance());
        }
        shardSpecs.put(interval, intervalShardSpecs);
    }
    log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
    return shardSpecs;
}
Also used : Granularity(io.druid.java.util.common.granularity.Granularity) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) InputRow(io.druid.data.input.InputRow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableSortedMap(com.google.common.collect.ImmutableSortedMap) Interval(org.joda.time.Interval)
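
The shard-count arithmetic in the loop above reduces to one line: roughly one shard per targetPartitionSize estimated unique rows. A hedged sketch, pulled out as a standalone helper (the method name and standalone form are illustrative; in IndexTask this logic is inline):

// Illustrative helper mirroring the inline arithmetic above.
static int estimateNumShards(HyperLogLogCollector collector, int targetPartitionSize) {
    final long numRows = (long) collector.estimateCardinality();
    return (int) Math.ceil((double) numRows / targetPartitionSize);
}
// e.g. an estimate of ~1.2M rows with targetPartitionSize = 500_000 yields ceil(2.4) = 3 shards;
// a result of 0 or 1 falls through to the NoneShardSpec branch in the caller.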

Example 9 with HyperLogLogCollector

use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.

the class HyperUniquesBufferAggregator method aggregate.

@Override
public void aggregate(ByteBuffer buf, int position) {
    HyperLogLogCollector collector = (HyperLogLogCollector) selector.get();
    if (collector == null) {
        return;
    }
    // Save the buffer's position and limit and restore them afterwards, instead of allocating a new ByteBuffer object
    final int oldPosition = buf.position();
    final int oldLimit = buf.limit();
    buf.limit(position + HyperLogLogCollector.getLatestNumBytesForDenseStorage());
    buf.position(position);
    try {
        HyperLogLogCollector.makeCollector(buf).fold(collector);
    } finally {
        buf.limit(oldLimit);
        buf.position(oldPosition);
    }
}
Also used : HyperLogLogCollector(io.druid.hll.HyperLogLogCollector)
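
For context, aggregate folds into a region that init must have pre-filled with an empty collector, and get reads that region back out. A hedged sketch of those two counterparts, reconstructed from the same pattern rather than quoted from the source, assuming the HyperLogLogCollector statics makeEmptyVersionedByteArray() and getLatestNumBytesForDenseStorage():

@Override
public void init(ByteBuffer buf, int position) {
    final ByteBuffer mutationBuffer = buf.duplicate();
    mutationBuffer.position(position);
    // Pre-fill the region with an empty, versioned collector so aggregate() can fold into it.
    mutationBuffer.put(HyperLogLogCollector.makeEmptyVersionedByteArray());
}

@Override
public Object get(ByteBuffer buf, int position) {
    final int size = HyperLogLogCollector.getLatestNumBytesForDenseStorage();
    final ByteBuffer mutationBuffer = buf.duplicate();
    mutationBuffer.position(position);
    mutationBuffer.limit(position + size);
    // Copy the region out so the returned collector does not alias the shared aggregation buffer.
    final ByteBuffer dataCopy = ByteBuffer.allocate(size);
    dataCopy.put(mutationBuffer);
    dataCopy.rewind();
    return HyperLogLogCollector.makeCollector(dataCopy);
}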

Example 10 with HyperLogLogCollector

use of io.druid.hll.HyperLogLogCollector in project druid by druid-io.

the class HyperUniquesSerde method getObjectStrategy.

@Override
public ObjectStrategy getObjectStrategy() {
    return new ObjectStrategy<HyperLogLogCollector>() {

        @Override
        public Class<? extends HyperLogLogCollector> getClazz() {
            return HyperLogLogCollector.class;
        }

        @Override
        public HyperLogLogCollector fromByteBuffer(ByteBuffer buffer, int numBytes) {
            final ByteBuffer readOnlyBuffer = buffer.asReadOnlyBuffer();
            readOnlyBuffer.limit(readOnlyBuffer.position() + numBytes);
            return HyperLogLogCollector.makeCollector(readOnlyBuffer);
        }

        @Override
        public byte[] toBytes(HyperLogLogCollector collector) {
            if (collector == null) {
                return new byte[] {};
            }
            ByteBuffer val = collector.toByteBuffer();
            byte[] retVal = new byte[val.remaining()];
            val.asReadOnlyBuffer().get(retVal);
            return retVal;
        }

        @Override
        public int compare(HyperLogLogCollector o1, HyperLogLogCollector o2) {
            return comparator.compare(o1, o2);
        }
    };
}
Also used : HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) ObjectStrategy(io.druid.segment.data.ObjectStrategy) ByteBuffer(java.nio.ByteBuffer)
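
A hedged round-trip sketch using only the strategy methods shown above; it assumes HyperUniquesSerde's constructor takes the hash function, as in this Druid version:

import com.google.common.hash.Hashing;
import io.druid.hll.HyperLogLogCollector;
import io.druid.query.aggregation.hyperloglog.HyperUniquesSerde;
import io.druid.segment.data.ObjectStrategy;
import java.nio.ByteBuffer;

public class HllSerdeRoundTrip {
    public static void main(String[] args) {
        // Assumption: HyperUniquesSerde(HashFunction) is the available constructor.
        ObjectStrategy strategy = new HyperUniquesSerde(Hashing.murmur3_128()).getObjectStrategy();
        HyperLogLogCollector original = HyperLogLogCollector.makeLatestCollector();
        original.add(Hashing.murmur3_128().hashLong(42L).asBytes());
        byte[] bytes = strategy.toBytes(original);
        HyperLogLogCollector restored =
            (HyperLogLogCollector) strategy.fromByteBuffer(ByteBuffer.wrap(bytes), bytes.length);
        // The byte round trip preserves the registers, so the estimates match exactly.
        System.out.println(original.estimateCardinality() == restored.estimateCardinality());
    }
}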

Aggregations

HyperLogLogCollector (io.druid.hll.HyperLogLogCollector): 14
Test (org.junit.Test): 6
InputRow (io.druid.data.input.InputRow): 3
Comparator (java.util.Comparator): 3
Random (java.util.Random): 3
HyperUniquesAggregatorFactory (io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory): 2
ObjectStrategy (io.druid.segment.data.ObjectStrategy): 2
ByteBuffer (java.nio.ByteBuffer): 2
Interval (org.joda.time.Interval): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
Optional (com.google.common.base.Optional): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
ImmutableSortedMap (com.google.common.collect.ImmutableSortedMap): 1
HashFunction (com.google.common.hash.HashFunction): 1
Firehose (io.druid.data.input.Firehose): 1
Granularity (io.druid.java.util.common.granularity.Granularity): 1
FileSmoosher (io.druid.java.util.common.io.smoosh.FileSmoosher): 1
SmooshedFileMapper (io.druid.java.util.common.io.smoosh.SmooshedFileMapper): 1
SmooshedWriter (io.druid.java.util.common.io.smoosh.SmooshedWriter): 1