Search in sources :

Example 1 with HllOptions

use of org.apache.solr.handler.component.StatsField.HllOptions in project lucene-solr by apache.

In the class StatsComponentTest, the method testHllOptions.

/**
   * Whitebox test that HLL option parsing does the right thing.
   *
   * <p>Covers: params that must not enable HLL at all, a sanity check of the HLL
   * library's log2m/regwidth bounds, the defaults chosen for the first group of
   * fields (regwidth 6) versus the second group and functions (regwidth 5), the
   * min/max accuracy heuristics, and the {@code hllPreHashed} option on a long field.
   *
   * @see #testCardinality 
   * @see #testHllOptionsErrors
   */
public void testHllOptions() throws Exception {
    SolrCore core = h.getCore();
    SchemaField field_l = core.getLatestSchema().getField("field_l");
    SchemaField field_d = core.getLatestSchema().getField("field_d");
    SchemaField field_dt = core.getLatestSchema().getField("field_dt");
    SchemaField field_s = core.getLatestSchema().getField("field_s");
    SchemaField field_i = core.getLatestSchema().getField("field_i");
    SchemaField field_f = core.getLatestSchema().getField("field_f");
    SchemaField field_severity = core.getLatestSchema().getField("severity");
    // simple cases that shouldn't use HLL
    assertNull(HllOptions.parseHllOptions(params(), field_l));
    assertNull(HllOptions.parseHllOptions(params("cardinality", "false"), field_l));
    // sanity check, future proof against the HLL library changing stuff on us
    assertEquals("HLL Changed definition min for log2m, " + "need to note in upgrade instructions and maybe adjust accuracy hueristic", 4, HLL.MINIMUM_LOG2M_PARAM);
    // NOTE: https://github.com/aggregateknowledge/java-hll/issues/14
    assertEquals("HLL Changed definition max for log2m, " + "need to note in upgrade instructions and maybe adjust accuracy hueristic", 30, HLL.MAXIMUM_LOG2M_PARAM);
    assertEquals("HLL Changed definition min for regwidth, " + "need to note in upgrade instructions and probably adjust hueristic", 1, HLL.MINIMUM_REGWIDTH_PARAM);
    assertEquals("HLL Changed definition max for regwidth, " + "need to note in upgrade instructions and probably adjust hueristic", 8, HLL.MAXIMUM_REGWIDTH_PARAM);
    // all of these should produce equivalent HLLOptions (Long, Double, or String using defaults)
    // NOTE: the option name is "hllPreHashed" (same key the explicit pre-hash checks at the
    // end of this method use); a misspelled key would not exercise the option at all.
    SolrParams[] longDefaultParams = new SolrParams[] {
        // basic usage
        params("cardinality", "true"),
        params("cardinality", "0.33"),
        // expert level options
        params("cardinality", "true", "hllLog2m", "13"),
        params("cardinality", "true", "hllRegwidth", "6"),
        params("cardinality", "true", "hllPreHashed", "false"),
        params("cardinality", "true", "hllLog2m", "13", "hllRegwidth", "6", "hllPreHashed", "false"),
        // explicit hllLog2m should override numeric arg
        params("cardinality", "1.0", "hllLog2m", "13", "hllRegwidth", "6"),
        params("cardinality", "0.0", "hllLog2m", "13", "hllRegwidth", "6", "hllPreHashed", "false")
    };
    for (SchemaField field : new SchemaField[] { field_l, field_d, field_dt, field_s }) {
        final String f = field.getName();
        for (SolrParams p : longDefaultParams) {
            HllOptions opts = HllOptions.parseHllOptions(p, field);
            assertEquals(f + " long defaults: " + p, 13, opts.getLog2m());
            assertEquals(f + " long defaults: " + p, 6, opts.getRegwidth());
            assertNotNull(f + " long defaults: " + p, opts.getHasher());
        }
        // non defaults: lower/upper accuracy bounds should give min/max log2m & adjusted regwidth
        HllOptions optsMin = HllOptions.parseHllOptions(params("cardinality", "0"), field);
        assertEquals(f + " min log2m", HLL.MINIMUM_LOG2M_PARAM, optsMin.getLog2m());
        // lowest heuristic for 64bit
        assertEquals(f + " min regwidth", 5, optsMin.getRegwidth());
        HllOptions optsMax = HllOptions.parseHllOptions(params("cardinality", "1"), field);
        assertEquals(f + " max log2m", HLL.MAXIMUM_LOG2M_PARAM, optsMax.getLog2m());
        assertEquals(f + " max regwidth", HLL.MAXIMUM_REGWIDTH_PARAM, optsMax.getRegwidth());
    }
    // all of these should produce equivalent HLLOptions (Int, Float, or ValueSource using defaults)
    SolrParams[] intDefaultParams = new SolrParams[] {
        // basic usage
        params("cardinality", "true"),
        params("cardinality", "0.33"),
        // expert level options
        params("cardinality", "true", "hllLog2m", "13"),
        params("cardinality", "true", "hllRegwidth", "5"),
        params("cardinality", "true", "hllPreHashed", "false"),
        params("cardinality", "true", "hllLog2m", "13", "hllRegwidth", "5", "hllPreHashed", "false"),
        // explicit hllLog2m & hllRegwidth should override heuristic float arg
        params("cardinality", "1.0", "hllLog2m", "13", "hllRegwidth", "5"),
        params("cardinality", "0.0", "hllLog2m", "13", "hllRegwidth", "5", "hllPreHashed", "false")
    };
    // a null field stands in for a function (ValueSource) rather than a schema field
    for (SchemaField field : new SchemaField[] { field_i, field_f, field_severity, null }) {
        final String f = null == field ? "(func)" : field.getName();
        for (SolrParams p : intDefaultParams) {
            HllOptions opts = HllOptions.parseHllOptions(p, field);
            assertEquals(f + " int defaults: " + p, 13, opts.getLog2m());
            assertEquals(f + " int defaults: " + p, 5, opts.getRegwidth());
            assertNotNull(f + " int defaults: " + p, opts.getHasher());
        }
        // non defaults: lower/upper accuracy bounds should give min/max log2m & adjusted regwidth
        HllOptions optsMin = HllOptions.parseHllOptions(params("cardinality", "0"), field);
        assertEquals(f + " min log2m", HLL.MINIMUM_LOG2M_PARAM, optsMin.getLog2m());
        // lowest heuristic for 32bit
        assertEquals(f + " min regwidth", 4, optsMin.getRegwidth());
        HllOptions optsMax = HllOptions.parseHllOptions(params("cardinality", "1"), field);
        assertEquals(f + " max log2m", HLL.MAXIMUM_LOG2M_PARAM, optsMax.getLog2m());
        assertEquals(f + " max regwidth", HLL.MAXIMUM_REGWIDTH_PARAM, optsMax.getRegwidth());
    }
    // basic pre-hashed arg check specifically for long fields
    assertNotNull(HllOptions.parseHllOptions(params("cardinality", "true"), field_l).getHasher());
    assertNotNull(HllOptions.parseHllOptions(params("cardinality", "true", "hllPreHashed", "false"), field_l).getHasher());
    assertNull(HllOptions.parseHllOptions(params("cardinality", "true", "hllPreHashed", "true"), field_l).getHasher());
}
Also used : SchemaField(org.apache.solr.schema.SchemaField) HllOptions(org.apache.solr.handler.component.StatsField.HllOptions) SolrCore(org.apache.solr.core.SolrCore) SolrParams(org.apache.solr.common.params.SolrParams) MapSolrParams(org.apache.solr.common.params.MapSolrParams)

Example 2 with HllOptions

use of org.apache.solr.handler.component.StatsField.HllOptions in project lucene-solr by apache.

In the class StatsComponentTest, the method testIndividualStatLocalParams.

/**
 * Verifies that individual stats can be requested (and suppressed) via local params
 * on {@code stats.field}, both for per-shard requests ({@code isShard=true}) and for
 * every combination of stats in a normal request.
 *
 * <p>Expected values (sum, stddev, t-digest bytes, HLL bytes) are computed locally while
 * the documents are indexed, then registered through {@code ExpectedStat} so the
 * combination loops below can look them up in {@code ExpectedStat.ALL}.
 */
public void testIndividualStatLocalParams() throws Exception {
    final String kpre = ExpectedStat.KPRE;
    assertU(adoc("id", "1", "a_f", "2.3", "b_f", "9.7", "a_i", "9", "foo_t", "how now brown cow"));
    assertU(commit());
    SolrCore core = h.getCore();
    SchemaField field = core.getLatestSchema().getField("a_i");
    HllOptions hllOpts = HllOptions.parseHllOptions(params("cardinality", "true"), field);
    HLL hll = hllOpts.newHLL();
    HashFunction hasher = hllOpts.getHasher();
    AVLTreeDigest tdigest = new AVLTreeDigest(100);
    // some quick sanity check assertions...
    // trivial check that we only get the exact 2 we ask for
    assertQ("ask for and get only 2 stats", req("q", "*:*", "stats", "true", "stats.field", "{!key=k mean=true min=true}a_i"), kpre + "double[@name='mean'][.='9.0']", kpre + "double[@name='min'][.='9.0']", "count(" + kpre + "*)=2");
    // for stats that are true/false, sanity check false does its job
    assertQ("min=true & max=false: only min should come back", req("q", "*:*", "stats", "true", "stats.field", "{!key=k max=false min=true}a_i"), kpre + "double[@name='min'][.='9.0']", "count(" + kpre + "*)=1");
    assertQ("min=false: localparam stat means ignore default set, " + "but since only local param is false no stats should be returned", req("q", "*:*", "stats", "true", "stats.field", "{!key=k min=false}a_i"), // section of stats for this field should exist ...
    XPRE + "lst[@name='stats_fields']/lst[@name='k']", // ...but be empty 
    "count(" + kpre + "*)=0");
    double sum = 0;
    double sumOfSquares = 0;
    final int count = 20;
    // index `count` docs (ids 0..count-1, replacing the id=1 doc added above) while
    // accumulating the same values locally to derive the expected stats
    for (int i = 0; i < count; i++) {
        int a_i = i % 10;
        assertU(adoc("id", String.valueOf(i), "a_f", "2.3", "b_f", "9.7", "a_i", String.valueOf(a_i), "foo_t", "how now brown cow"));
        tdigest.add(a_i);
        hll.addRaw(hasher.hashInt(a_i).asLong());
        sum += a_i;
        sumOfSquares += (a_i) * (a_i);
    }
    // sample standard deviation: sqrt((n*sum(x^2) - sum(x)^2) / (n*(n-1))), with n = count
    double stddev = Math.sqrt(((count * sumOfSquares) - (sum * sum)) / (count * (count - 1.0D)));
    assertU(commit());
    ByteBuffer tdigestBuf = ByteBuffer.allocate(tdigest.smallByteSize());
    tdigest.asSmallBytes(tdigestBuf);
    byte[] hllBytes = hll.toBytes();
    EnumSet<Stat> allStats = EnumSet.allOf(Stat.class);
    // the return values of the factory calls below are not used here; the loops further
    // down read the expectations back from ExpectedStat.ALL
    ExpectedStat.createSimple(Stat.min, "true", "double", "0.0");
    ExpectedStat.createSimple(Stat.max, "true", "double", "9.0");
    ExpectedStat.createSimple(Stat.missing, "true", "long", "0");
    ExpectedStat.createSimple(Stat.sum, "true", "double", String.valueOf(sum));
    ExpectedStat.createSimple(Stat.count, "true", "long", String.valueOf(count));
    ExpectedStat.createSimple(Stat.mean, "true", "double", String.valueOf(sum / count));
    ExpectedStat.createSimple(Stat.sumOfSquares, "true", "double", String.valueOf(sumOfSquares));
    ExpectedStat.createSimple(Stat.stddev, "true", "double", String.valueOf(stddev));
    final String distinctValsXpath = "count(" + kpre + "arr[@name='distinctValues']/*)=10";
    ExpectedStat.create(Stat.distinctValues, "true", Collections.singletonList(distinctValsXpath), Collections.singletonList(distinctValsXpath));
    ExpectedStat.createSimple(Stat.countDistinct, "true", "long", "10");
    final String percentileShardXpath = kpre + "str[@name='percentiles'][.='" + Base64.byteArrayToBase64(tdigestBuf.array(), 0, tdigestBuf.array().length) + "']";
    final String p90 = "" + tdigest.quantile(0.90D);
    final String p99 = "" + tdigest.quantile(0.99D);
    ExpectedStat.create(Stat.percentiles, "'90, 99'", Collections.singletonList(percentileShardXpath), Arrays.asList("count(" + kpre + "lst[@name='percentiles']/*)=2", kpre + "lst[@name='percentiles']/double[@name='90.0'][.=" + p90 + "]", kpre + "lst[@name='percentiles']/double[@name='99.0'][.=" + p99 + "]"));
    final String cardinalityShardXpath = kpre + "str[@name='cardinality'][.='" + Base64.byteArrayToBase64(hllBytes, 0, hllBytes.length) + "']";
    final String cardinalityXpath = kpre + "long[@name='cardinality'][.='10']";
    ExpectedStat.create(Stat.cardinality, "true", Collections.singletonList(cardinalityShardXpath), Collections.singletonList(cardinalityXpath));
    // canary in the coal mine: the enum size is the expected value, the number of
    // registered ExpectedStat entries is what this test actually produced
    assertEquals("num of ExpectedStat doesn't match all known stats; " + "enum was updated w/o updating test?", allStats.size(), ExpectedStat.ALL.size());
    // whitebox test: explicitly ask for isShard=true with each individual stat
    for (ExpectedStat expect : ExpectedStat.ALL.values()) {
        Stat stat = expect.stat;
        StringBuilder exclude = new StringBuilder();
        List<String> testXpaths = new ArrayList<String>(5 + expect.perShardXpaths.size());
        testXpaths.addAll(expect.perShardXpaths);
        int numKeysExpected = 0;
        EnumSet<Stat> distribDeps = stat.getDistribDeps();
        for (Stat perShardDep : distribDeps) {
            numKeysExpected++;
            // the shard should return them since they are a dependency for the requested stat
            if (!stat.equals(perShardDep)) {
                // NOTE: this only works because all the cases where there are distribDeps
                // beyond a self dependency are simple true/false options
                exclude.append(perShardDep).append("=false ");
            }
        }
        // we don't want to find anything we aren't expecting
        testXpaths.add("count(" + kpre + "*)=" + numKeysExpected);
        assertQ("ask for only " + stat + ", with isShard=true, and expect only deps: " + distribDeps, req("q", "*:*", "isShard", "true", "stats", "true", "stats.field", "{!key=k " + exclude + stat + "=" + expect.input + "}a_i"), testXpaths.toArray(new String[0]));
    }
    // test all the possible combinations (of all possible sizes) of stats params
    for (int numParams = 1; numParams <= allStats.size(); numParams++) {
        for (EnumSet<Stat> set : new StatSetCombinations(numParams, allStats)) {
            // EnumSets use natural ordering, we want to randomize the order of the params
            List<Stat> combo = new ArrayList<Stat>(set);
            Collections.shuffle(combo, random());
            StringBuilder paras = new StringBuilder("{!key=k ");
            List<String> testXpaths = new ArrayList<String>(numParams + 5);
            int numKeysExpected = 0;
            for (Stat stat : combo) {
                ExpectedStat expect = ExpectedStat.ALL.get(stat);
                paras.append(stat).append("=").append(expect.input).append(" ");
                numKeysExpected++;
                testXpaths.addAll(expect.finalXpaths);
            }
            paras.append("}a_i");
            // we don't want to find anything we aren't expecting
            testXpaths.add("count(" + kpre + "*)=" + numKeysExpected);
            assertQ("ask for and get only: " + combo, req("q", "*:*", "stats", "true", "stats.field", paras.toString()), testXpaths.toArray(new String[0]));
        }
    }
}
Also used : SolrCore(org.apache.solr.core.SolrCore) ArrayList(java.util.ArrayList) HLL(org.apache.solr.util.hll.HLL) ByteBuffer(java.nio.ByteBuffer) AVLTreeDigest(com.tdunning.math.stats.AVLTreeDigest) SchemaField(org.apache.solr.schema.SchemaField) HllOptions(org.apache.solr.handler.component.StatsField.HllOptions) Stat(org.apache.solr.handler.component.StatsField.Stat) HashFunction(com.google.common.hash.HashFunction)

Aggregations

SolrCore (org.apache.solr.core.SolrCore)2 HllOptions (org.apache.solr.handler.component.StatsField.HllOptions)2 SchemaField (org.apache.solr.schema.SchemaField)2 HashFunction (com.google.common.hash.HashFunction)1 AVLTreeDigest (com.tdunning.math.stats.AVLTreeDigest)1 ByteBuffer (java.nio.ByteBuffer)1 ArrayList (java.util.ArrayList)1 MapSolrParams (org.apache.solr.common.params.MapSolrParams)1 SolrParams (org.apache.solr.common.params.SolrParams)1 Stat (org.apache.solr.handler.component.StatsField.Stat)1 HLL (org.apache.solr.util.hll.HLL)1