Search in sources :

Example 6 with HyperLogLog

use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.

the class HllUtil method clone.

/**
 * Copies {@code hll} by merging it into a freshly constructed HyperLogLog.
 *
 * @param hll the estimator whose contents are merged into the copy
 * @param log2m the log2m value handed to the constructor of the copy
 * @return a new HyperLogLog into which {@code hll} has been merged
 * @throws RuntimeException wrapping the CardinalityMergeException raised if the merge fails
 */
public static HyperLogLog clone(HyperLogLog hll, int log2m) {
    final HyperLogLog copy = new HyperLogLog(log2m);
    try {
        copy.addAll(hll);
    } catch (CardinalityMergeException mergeFailure) {
        // Surface the checked merge failure as unchecked, keeping the original cause.
        throw new RuntimeException(mergeFailure);
    }
    return copy;
}
Also used : HyperLogLog(com.clearspring.analytics.stream.cardinality.HyperLogLog) CardinalityMergeException(com.clearspring.analytics.stream.cardinality.CardinalityMergeException)

Example 7 with HyperLogLog

use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.

the class BaseHllStarTreeIndexTest method computeHll.

/**
 * Computes the estimated distinct count of each HLL metric for every group-by key, by
 * scanning the given set of doc-ids.
 *
 * <p>For each doc-id the group key is built by appending the string value of every group-by
 * column, each followed by "_". The doc's dictionary-encoded metric values are deserialized
 * with {@code HllUtil.convertStringToHll} and merged into the per-key HyperLogLog of the
 * matching metric.
 *
 * @param segment segment providing the data source (dictionary and value iterator) per column
 * @param docIdIterator iterator over the doc-ids to scan; scanning stops at {@code Constants.EOF}
 * @param metricNames names of the metric columns holding serialized HLL values
 * @param groupByColumns names of the columns whose values form the group key
 * @return map from group key to the cardinality estimates, indexed like {@code metricNames}
 * @throws Exception if reading the segment or merging HLL values fails
 */
private Map<String, long[]> computeHll(IndexSegment segment, BlockDocIdIterator docIdIterator, List<String> metricNames, List<String> groupByColumns) throws Exception {
    int docId;
    int numMetrics = metricNames.size();
    Dictionary[] metricDictionaries = new Dictionary[numMetrics];
    BlockSingleValIterator[] metricValIterators = new BlockSingleValIterator[numMetrics];
    int numGroupByColumns = groupByColumns.size();
    Dictionary[] groupByDictionaries = new Dictionary[numGroupByColumns];
    BlockSingleValIterator[] groupByValIterators = new BlockSingleValIterator[numGroupByColumns];
    for (int i = 0; i < numMetrics; i++) {
        String metricName = metricNames.get(i);
        DataSource dataSource = segment.getDataSource(metricName);
        metricDictionaries[i] = dataSource.getDictionary();
        metricValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
    }
    for (int i = 0; i < numGroupByColumns; i++) {
        String groupByColumn = groupByColumns.get(i);
        DataSource dataSource = segment.getDataSource(groupByColumn);
        groupByDictionaries[i] = dataSource.getDictionary();
        groupByValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
    }
    Map<String, HyperLogLog[]> result = new HashMap<>();
    while ((docId = docIdIterator.next()) != Constants.EOF) {
        // Build the group key: "<value1>_<value2>_..." (every value is followed by "_").
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < numGroupByColumns; i++) {
            groupByValIterators[i].skipTo(docId);
            int dictId = groupByValIterators[i].nextIntVal();
            stringBuilder.append(groupByDictionaries[i].getStringValue(dictId));
            stringBuilder.append("_");
        }
        String key = stringBuilder.toString();
        // Single lookup; the map never stores null values, so null means "key not seen yet".
        HyperLogLog[] hllSoFar = result.get(key);
        if (hllSoFar == null) {
            // init: one empty HyperLogLog per metric
            hllSoFar = new HyperLogLog[numMetrics];
            for (int i = 0; i < numMetrics; i++) {
                hllSoFar[i] = new HyperLogLog(HLL_CONFIG.getHllLog2m());
            }
            result.put(key, hllSoFar);
        }
        for (int i = 0; i < numMetrics; i++) {
            metricValIterators[i].skipTo(docId);
            int dictId = metricValIterators[i].nextIntVal();
            HyperLogLog value = HllUtil.convertStringToHll(metricDictionaries[i].getStringValue(dictId));
            hllSoFar[i].addAll(value);
        }
    }
    // construct ret: replace each merged HyperLogLog by its cardinality estimate
    Map<String, long[]> ret = new HashMap<>();
    for (Map.Entry<String, HyperLogLog[]> entry : result.entrySet()) {
        HyperLogLog[] hlls = entry.getValue();
        long[] valueArray = new long[numMetrics];
        for (int i = 0; i < numMetrics; i++) {
            valueArray[i] = hlls[i].cardinality();
        }
        ret.put(entry.getKey(), valueArray);
    }
    return ret;
}
Also used : Dictionary(com.linkedin.pinot.core.segment.index.readers.Dictionary) HyperLogLog(com.clearspring.analytics.stream.cardinality.HyperLogLog)

Example 8 with HyperLogLog

use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.

the class HllFieldSizeTest method testHllFieldSerializedSize.

/**
 * Checks that the serialized size of a HyperLogLog equals
 * {@code HllUtil.getHllFieldSizeFromLog2m(log2m)} for log2m in [5, 10), both when nothing has
 * been offered and after 100 and 10000 random values have been offered in total.
 */
@Test
public void testHllFieldSerializedSize() throws Exception {
    // Number of random values offered right before each size check (cumulative: 0, 100, 10000).
    final int[] offersBeforeCheck = { 0, 100, 9900 };
    for (int log2m = 5; log2m < 10; log2m++) {
        HyperLogLog hll = new HyperLogLog(log2m);
        for (int batchSize : offersBeforeCheck) {
            int remaining = batchSize;
            while (remaining-- > 0) {
                hll.offer(rand.nextLong());
            }
            Assert.assertEquals(HllUtil.getHllFieldSizeFromLog2m(log2m), hll.getBytes().length);
            LOGGER.info("Estimated: " + hll.cardinality());
        }
    }
}
Also used : HyperLogLog(com.clearspring.analytics.stream.cardinality.HyperLogLog) Test(org.testng.annotations.Test)

Example 9 with HyperLogLog

use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project drill by apache.

the class HLLMergedStatistic method setOutput.

/**
 * Writes, for every column vector in {@code output}, the serialized HyperLogLog held for that
 * column into slot 0, or null when no HyperLogLog is held for the column or it cannot be
 * serialized. Moves this statistic to the COMPLETE state afterwards.
 *
 * @param output map vector whose child vectors are cast to NullableVarBinaryVector and filled
 */
@Override
public void setOutput(MapVector output) {
    // Check the input is a Map Vector
    assert (output.getField().getType().getMinorType() == TypeProtos.MinorType.MAP);
    // Dependencies have been configured correctly
    assert (state == State.MERGE);
    for (ValueVector outMapCol : output) {
        String colName = outMapCol.getField().getName();
        HyperLogLog colHLLHolder = hllHolder.get(colName);
        NullableVarBinaryVector vv = (NullableVarBinaryVector) outMapCol;
        vv.allocateNewSafe();
        try {
            if (colHLLHolder != null) {
                // Serialize once; each getBytes() call re-serializes the whole sketch.
                byte[] serializedHll = colHLLHolder.getBytes();
                vv.getMutator().setSafe(0, serializedHll, 0, serializedHll.length);
            } else {
                vv.getMutator().setNull(0);
            }
        } catch (IOException ex) {
            // TODO: logger
            // Serialization failed before anything was written for this column; record an
            // explicit null rather than leaving the slot unwritten.
            vv.getMutator().setNull(0);
        }
    }
    state = State.COMPLETE;
}
Also used : ValueVector(org.apache.drill.exec.vector.ValueVector) NullableVarBinaryVector(org.apache.drill.exec.vector.NullableVarBinaryVector) IOException(java.io.IOException) HyperLogLog(com.clearspring.analytics.stream.cardinality.HyperLogLog)

Example 10 with HyperLogLog

use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project drill by apache.

the class NDVMergedStatistic method setOutput.

/**
 * Writes an estimated number of distinct values (NDV) for every column vector in
 * {@code output} into slot 0, or null when no HyperLogLog is held for the column. Moves this
 * statistic to the COMPLETE state afterwards.
 *
 * @param output map vector whose child vectors are cast to NullableBigIntVector and filled
 */
@Override
public void setOutput(MapVector output) {
    // The output must be a MAP vector
    assert (output.getField().getType().getMinorType() == TypeProtos.MinorType.MAP);
    // This statistic must currently be in the MERGE state
    assert (state == State.MERGE);
    for (ValueVector columnVector : output) {
        final String columnName = columnVector.getField().getName();
        final HyperLogLog sketch = hllHolder.get(columnName);
        final NullableBigIntVector ndvVector = (NullableBigIntVector) columnVector;
        ndvVector.allocateNewSafe();
        if (sketch == null) {
            ndvVector.getMutator().setNull(0);
            continue;
        }
        /*
         * Duj1 estimator - Peter J. Haas & Lynne Stokes (1998) Estimating the Number of Classes
         * in a Finite Population, Journal of the American Statistical Association, 93:444,
         * 1475-1487
         *
         *   n*d / (n - f1 + f1*n/N) where
         *     n  - sample rows
         *     N  - total rows
         *     d  - ndv of sample
         *     f1 - number of singletons
         *
         * The estimate is capped at N.
         */
        double sampleRows = (samplePercent / 100.0) * getRowCount(columnName);
        double sampleSingletons = sampleRows - sumDups.getStat(columnName);
        double numerator = sampleRows * sketch.cardinality();
        // n/N is expressed as samplePercent / 100
        double denominator = sampleRows - sampleSingletons + sampleSingletons * samplePercent / 100.0;
        double cappedNdv = Math.min(numerator / denominator, 100.0 * sampleRows / samplePercent);
        ndvVector.getMutator().setSafe(0, 1, (long) cappedNdv);
    }
    state = State.COMPLETE;
}
Also used : ValueVector(org.apache.drill.exec.vector.ValueVector) NullableBigIntVector(org.apache.drill.exec.vector.NullableBigIntVector) HyperLogLog(com.clearspring.analytics.stream.cardinality.HyperLogLog)

Aggregations

HyperLogLog (com.clearspring.analytics.stream.cardinality.HyperLogLog): 18, CardinalityMergeException (com.clearspring.analytics.stream.cardinality.CardinalityMergeException): 8, ValueVector (org.apache.drill.exec.vector.ValueVector): 4, IOException (java.io.IOException): 3, NullableVarBinaryVector (org.apache.drill.exec.vector.NullableVarBinaryVector): 3, FieldSpec (com.linkedin.pinot.common.data.FieldSpec): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 2, DataInputStream (java.io.DataInputStream): 2, Dictionary (com.linkedin.pinot.core.segment.index.readers.Dictionary): 1, NullableBigIntVector (org.apache.drill.exec.vector.NullableBigIntVector): 1, Test (org.testng.annotations.Test): 1