use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.
the class HllUtil method clone.
/**
 * Builds a new HyperLogLog with the given {@code log2m} and merges {@code hll} into it.
 *
 * @param hll the estimator whose contents are merged into the new instance
 * @param log2m the log2m value passed to the new estimator's constructor
 * @return the newly created estimator after {@code hll} has been merged into it
 * @throws RuntimeException wrapping the {@link CardinalityMergeException} if the merge fails
 */
public static HyperLogLog clone(HyperLogLog hll, int log2m) {
  final HyperLogLog copy = new HyperLogLog(log2m);
  try {
    copy.addAll(hll);
  } catch (CardinalityMergeException mergeFailure) {
    // Surface the checked merge failure as unchecked, keeping the original cause.
    throw new RuntimeException(mergeFailure);
  }
  return copy;
}
use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.
the class BaseHllStarTreeIndexTest method computeHll.
/**
 * Computes, per group, the estimated cardinality of each HLL metric by scanning the given
 * set of doc-ids.
 *
 * <p>For every doc-id the group key is built by appending the dictionary string value of each
 * group-by column followed by {@code "_"}. The string value of each metric column is converted
 * to a HyperLogLog via {@code HllUtil.convertStringToHll} and merged into the running
 * HyperLogLog kept for that group and metric.
 *
 * @param segment segment supplying the data sources of the metric and group-by columns
 * @param docIdIterator iterator over the doc-ids to scan; read until {@code Constants.EOF}
 * @param metricNames names of the metric columns holding HLL values
 * @param groupByColumns names of the columns whose values form the group key
 * @return map from group key to an array, indexed like {@code metricNames}, holding the
 *     cardinality reported by the merged HyperLogLog of each metric
 * @throws Exception if converting or merging a HyperLogLog fails
 */
private Map<String, long[]> computeHll(IndexSegment segment, BlockDocIdIterator docIdIterator, List<String> metricNames, List<String> groupByColumns) throws Exception {
  int docId;
  int numMetrics = metricNames.size();
  Dictionary[] metricDictionaries = new Dictionary[numMetrics];
  BlockSingleValIterator[] metricValIterators = new BlockSingleValIterator[numMetrics];
  int numGroupByColumns = groupByColumns.size();
  Dictionary[] groupByDictionaries = new Dictionary[numGroupByColumns];
  BlockSingleValIterator[] groupByValIterators = new BlockSingleValIterator[numGroupByColumns];

  // Resolve the dictionary and value iterator of every metric column once, up front.
  for (int i = 0; i < numMetrics; i++) {
    String metricName = metricNames.get(i);
    DataSource dataSource = segment.getDataSource(metricName);
    metricDictionaries[i] = dataSource.getDictionary();
    metricValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
  }

  // Same for every group-by column.
  for (int i = 0; i < numGroupByColumns; i++) {
    String groupByColumn = groupByColumns.get(i);
    DataSource dataSource = segment.getDataSource(groupByColumn);
    groupByDictionaries[i] = dataSource.getDictionary();
    groupByValIterators[i] = (BlockSingleValIterator) dataSource.getNextBlock().getBlockValueSet().iterator();
  }

  Map<String, HyperLogLog[]> result = new HashMap<>();
  while ((docId = docIdIterator.next()) != Constants.EOF) {
    // Group key: each group-by value followed by "_".
    StringBuilder stringBuilder = new StringBuilder();
    for (int i = 0; i < numGroupByColumns; i++) {
      groupByValIterators[i].skipTo(docId);
      int dictId = groupByValIterators[i].nextIntVal();
      stringBuilder.append(groupByDictionaries[i].getStringValue(dictId));
      stringBuilder.append("_");
    }
    String key = stringBuilder.toString();

    // Single lookup; stored arrays are never null, so null means "group not seen yet".
    HyperLogLog[] hllSoFar = result.get(key);
    if (hllSoFar == null) {
      hllSoFar = new HyperLogLog[numMetrics];
      for (int i = 0; i < numMetrics; i++) {
        hllSoFar[i] = new HyperLogLog(HLL_CONFIG.getHllLog2m());
      }
      result.put(key, hllSoFar);
    }

    // Merge this document's HLL for each metric into the group's running HLL.
    for (int i = 0; i < numMetrics; i++) {
      metricValIterators[i].skipTo(docId);
      int dictId = metricValIterators[i].nextIntVal();
      HyperLogLog value = HllUtil.convertStringToHll(metricDictionaries[i].getStringValue(dictId));
      hllSoFar[i].addAll(value);
    }
  }

  // Convert the merged HLLs into their cardinality estimates.
  Map<String, long[]> ret = new HashMap<>();
  for (Map.Entry<String, HyperLogLog[]> entry : result.entrySet()) {
    HyperLogLog[] hlls = entry.getValue();
    long[] valueArray = new long[numMetrics];
    for (int i = 0; i < numMetrics; i++) {
      valueArray[i] = hlls[i].cardinality();
    }
    ret.put(entry.getKey(), valueArray);
  }
  return ret;
}
use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project pinot by linkedin.
the class HllFieldSizeTest method testHllFieldSerializedSize.
/**
 * Checks, for log2m values 5 through 9, that the serialized length of a HyperLogLog equals
 * {@code HllUtil.getHllFieldSizeFromLog2m(log2m)} at three points: before any value is
 * offered, after 100 random values, and after a further 9900 random values.
 */
@Test
public void testHllFieldSerializedSize() throws Exception {
  // Number of random values offered before each size check.
  final int[] offersBeforeCheck = {0, 100, 9900};
  for (int log2m = 5; log2m < 10; log2m++) {
    HyperLogLog sketch = new HyperLogLog(log2m);
    for (int batchSize : offersBeforeCheck) {
      for (int offered = 0; offered < batchSize; offered++) {
        sketch.offer(rand.nextLong());
      }
      Assert.assertEquals(HllUtil.getHllFieldSizeFromLog2m(log2m), sketch.getBytes().length);
      LOGGER.info("Estimated: " + sketch.cardinality());
    }
  }
}
use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project drill by apache.
the class HLLMergedStatistic method setOutput.
/**
 * Writes the HyperLogLog held for each column into the corresponding child vector of
 * {@code output}, then marks this statistic as complete.
 *
 * <p>Each child vector is cast to {@link NullableVarBinaryVector} and receives, at index 0,
 * the bytes returned by {@code HyperLogLog.getBytes()} for the column of the same name. A
 * column with no HyperLogLog in {@code hllHolder}, or whose HyperLogLog fails to serialize,
 * is set to null at index 0.
 *
 * @param output the map vector whose child vectors receive the serialized HyperLogLogs
 */
@Override
public void setOutput(MapVector output) {
  // Check the input is a Map Vector
  assert (output.getField().getType().getMinorType() == TypeProtos.MinorType.MAP);
  // Dependencies have been configured correctly
  assert (state == State.MERGE);
  for (ValueVector outMapCol : output) {
    String colName = outMapCol.getField().getName();
    HyperLogLog colHLLHolder = hllHolder.get(colName);
    NullableVarBinaryVector vv = (NullableVarBinaryVector) outMapCol;
    vv.allocateNewSafe();
    if (colHLLHolder == null) {
      vv.getMutator().setNull(0);
      continue;
    }
    try {
      // Serialize once and reuse the array for both the data and its length.
      byte[] serialized = colHLLHolder.getBytes();
      vv.getMutator().setSafe(0, serialized, 0, serialized.length);
    } catch (IOException ex) {
      // Best effort: a column that cannot be serialized is written as an explicit null
      // rather than being left unset while the state still advances to COMPLETE.
      // TODO: logger
      vv.getMutator().setNull(0);
    }
  }
  state = State.COMPLETE;
}
use of com.clearspring.analytics.stream.cardinality.HyperLogLog in project drill by apache.
the class NDVMergedStatistic method setOutput.
/**
 * Writes an estimated number of distinct values for each column into the corresponding child
 * vector of {@code output}, then marks this statistic as complete.
 *
 * <p>Each child vector is cast to {@link NullableBigIntVector}. When a HyperLogLog exists for
 * the column, the value written at index 0 is the Duj1 estimate described below, truncated to
 * a {@code long}; otherwise index 0 is set to null.
 *
 * @param output the map vector whose child vectors receive the estimates
 */
@Override
public void setOutput(MapVector output) {
  // The output must be a Map Vector
  assert (output.getField().getType().getMinorType() == TypeProtos.MinorType.MAP);
  // This statistic must be in the MERGE state before output is produced
  assert (state == State.MERGE);
  for (ValueVector columnVector : output) {
    final String column = columnVector.getField().getName();
    final HyperLogLog sketch = hllHolder.get(column);
    final NullableBigIntVector ndvVector = (NullableBigIntVector) columnVector;
    ndvVector.allocateNewSafe();
    if (sketch == null) {
      ndvVector.getMutator().setNull(0);
      continue;
    }
    /* Duj1 estimator - Peter J. Haas & Lynne Stokes (1998) Estimating the Number of Classes in a
     * Finite Population, Journal of the American Statistical Association, 93:444, 1475-1487
     *   n*d / (n - f1 + f1*n/N) where
     *   n  - sample rows
     *   N  - total rows
     *   d  - ndv of sample
     *   f1 - number of singletons
     * The estimate is capped at N. Here n/N is expressed as samplePercent / 100.
     */
    final double n = (samplePercent / 100.0) * getRowCount(column);
    final double f1 = n - sumDups.getStat(column);
    final double numerator = n * sketch.cardinality();
    final double denominator = n - f1 + f1 * samplePercent / 100.0;
    // 100 * n / samplePercent recovers N, the total row count, used as the upper bound.
    final double cappedNdv = Math.min(numerator / denominator, 100.0 * n / samplePercent);
    ndvVector.getMutator().setSafe(0, 1, (long) cappedNdv);
  }
  state = State.COMPLETE;
}
Aggregations