Use of org.apache.solr.client.solrj.response.FieldStatsInfo in project lucene-solr by apache.
In class TestCloudPivotFacet, method assertPivotStats:
/**
 * Compare top level stats in response with stats from pivot constraint.
 */
private void assertPivotStats(String message, PivotField constraint, QueryResponse response) {
  if (null == constraint.getFieldStatsInfo()) {
    // no stats for this pivot, nothing to check
    // TODO: use a trace param to know if/how-many to expect?
    log.info("No stats to check for => " + message);
    return;
  }
  Map<String, FieldStatsInfo> actualFieldStatsInfoMap = response.getFieldStatsInfo();
  for (FieldStatsInfo pivotStats : constraint.getFieldStatsInfo().values()) {
    String statsKey = pivotStats.getName();
    FieldStatsInfo actualStats = actualFieldStatsInfoMap.get(statsKey);
    if (actualStats == null) {
      // handle the case of stats not found (when using a stats query)
      //
      // this has to be a special case check due to the legacy behavior of "top level"
      // StatsComponent results being "null" (and not even included in the
      // getFieldStatsInfo() Map due to special SolrJ logic)
      log.info("Requested stats missing in verification query, pivot stats: " + pivotStats);
      assertEquals("Special Count", 0L, pivotStats.getCount().longValue());
      assertEquals("Special Missing", constraint.getCount(), pivotStats.getMissing().longValue());
    } else {
      String msg = " of " + statsKey + " => " + message;
      // no wiggle room, these should always be exactly equal, regardless of field type
      assertEquals("Count" + msg, pivotStats.getCount(), actualStats.getCount());
      assertEquals("Missing" + msg, pivotStats.getMissing(), actualStats.getMissing());
      assertEquals("Min" + msg, pivotStats.getMin(), actualStats.getMin());
      assertEquals("Max" + msg, pivotStats.getMax(), actualStats.getMax());
      // precision loss can affect these in some field types, depending on the shards used
      // and the order in which values are accumulated
      assertNumerics("Sum" + msg, pivotStats.getSum(), actualStats.getSum());
      assertNumerics("Mean" + msg, pivotStats.getMean(), actualStats.getMean());
      assertNumerics("Stddev" + msg, pivotStats.getStddev(), actualStats.getStddev());
      assertNumerics("SumOfSquares" + msg, pivotStats.getSumOfSquares(), actualStats.getSumOfSquares());
    }
  }
  if (constraint.getFieldStatsInfo().containsKey("sk2")) {
    // cheeseball hack:
    // if "sk2" was one of the stats we computed, then we must have also seen
    // sk1 or sk3 because of the way the tags are fixed
    assertEquals("had stats sk2, but not another stat?", 2, constraint.getFieldStatsInfo().size());
  } else {
    // if we did not see "sk2", then one of the others must be alone
    assertEquals("only expected 1 stat", 1, constraint.getFieldStatsInfo().size());
    assertTrue("not sk1 or sk3",
               constraint.getFieldStatsInfo().containsKey("sk1")
               || constraint.getFieldStatsInfo().containsKey("sk3"));
  }
}
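For context, a minimal sketch of the request/response pattern this helper verifies (the field names cat and price, the key price_stats, and the solrClient variable are illustrative assumptions, not taken from the test): tagging a stats.field and referencing that tag from facet.pivot makes per-constraint stats available via PivotField.getFieldStatsInfo().

// Minimal sketch, assuming a SolrClient named solrClient and hypothetical
// fields "cat" and "price"; not part of the test above.
SolrQuery q = new SolrQuery("*:*");
q.setRows(0);
q.set("stats", true);
q.set("stats.field", "{!key=price_stats tag=s1}price");
q.setFacet(true);
q.set("facet.pivot", "{!stats=s1}cat");
QueryResponse rsp = solrClient.query(q);
for (PivotField pf : rsp.getFacetPivot().get("cat")) {
  // per-constraint stats, keyed by the stats.field key
  FieldStatsInfo perConstraintStats = pf.getFieldStatsInfo().get("price_stats");
}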
Use of org.apache.solr.client.solrj.response.FieldStatsInfo in project lucene-solr by apache.
In class DistributedFacetPivotLargeTest, method doTestDeepPivotStats:
private void doTestDeepPivotStats() throws Exception {
  QueryResponse rsp = query("q", "*:*", "rows", "0",
                            "facet", "true", "facet.pivot", "{!stats=s1}place_s,company_t",
                            "stats", "true", "stats.field", "{!key=avg_price tag=s1}pay_i");
  List<PivotField> pivots = rsp.getFacetPivot().get("place_s,company_t");

  PivotField cardiffPivotField = pivots.get(0);
  assertEquals("cardiff", cardiffPivotField.getValue());
  assertEquals(257, cardiffPivotField.getCount());

  FieldStatsInfo cardiffStatsInfo = cardiffPivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", cardiffStatsInfo.getName());
  assertEquals(0.0, cardiffStatsInfo.getMin());
  assertEquals(8742.0, cardiffStatsInfo.getMax());
  assertEquals(257, (long) cardiffStatsInfo.getCount());
  assertEquals(0, (long) cardiffStatsInfo.getMissing());
  assertEquals(347554.0, cardiffStatsInfo.getSum());
  assertEquals(8.20968772E8, cardiffStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(1352.35019455253, (double) cardiffStatsInfo.getMean(), 0.1E-7);
  assertEquals(1170.86048165857, cardiffStatsInfo.getStddev(), 0.1E-7);

  PivotField bbcCardiffPivotField = cardiffPivotField.getPivot().get(0);
  assertEquals("bbc", bbcCardiffPivotField.getValue());
  assertEquals(101, bbcCardiffPivotField.getCount());

  FieldStatsInfo bbcCardiffPivotFieldStatsInfo = bbcCardiffPivotField.getFieldStatsInfo().get("avg_price");
  assertEquals(2400.0, bbcCardiffPivotFieldStatsInfo.getMin());
  assertEquals(8742.0, bbcCardiffPivotFieldStatsInfo.getMax());
  assertEquals(101, (long) bbcCardiffPivotFieldStatsInfo.getCount());
  assertEquals(0, (long) bbcCardiffPivotFieldStatsInfo.getMissing());
  assertEquals(248742.0, bbcCardiffPivotFieldStatsInfo.getSum());
  assertEquals(6.52422564E8, bbcCardiffPivotFieldStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(2462.792079208, (double) bbcCardiffPivotFieldStatsInfo.getMean(), 0.1E-7);
  assertEquals(631.0525860312, bbcCardiffPivotFieldStatsInfo.getStddev(), 0.1E-7);

  PivotField placeholder0PivotField = pivots.get(2);
  assertEquals("0placeholder", placeholder0PivotField.getValue());
  assertEquals(6, placeholder0PivotField.getCount());

  FieldStatsInfo placeholder0PivotFieldStatsInfo = placeholder0PivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", placeholder0PivotFieldStatsInfo.getName());
  assertEquals(2000.0, placeholder0PivotFieldStatsInfo.getMin());
  assertEquals(6400.0, placeholder0PivotFieldStatsInfo.getMax());
  assertEquals(6, (long) placeholder0PivotFieldStatsInfo.getCount());
  assertEquals(0, (long) placeholder0PivotFieldStatsInfo.getMissing());
  assertEquals(22700.0, placeholder0PivotFieldStatsInfo.getSum());
  assertEquals(1.0105E8, placeholder0PivotFieldStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(3783.333333333, (double) placeholder0PivotFieldStatsInfo.getMean(), 0.1E-7);
  assertEquals(1741.742422595, placeholder0PivotFieldStatsInfo.getStddev(), 0.1E-7);

  PivotField microsoftPlaceholder0PivotField = placeholder0PivotField.getPivot().get(1);
  assertEquals("microsoft", microsoftPlaceholder0PivotField.getValue());
  assertEquals(6, microsoftPlaceholder0PivotField.getCount());

  FieldStatsInfo microsoftPlaceholder0PivotFieldStatsInfo = microsoftPlaceholder0PivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", microsoftPlaceholder0PivotFieldStatsInfo.getName());
  assertEquals(2000.0, microsoftPlaceholder0PivotFieldStatsInfo.getMin());
  assertEquals(6400.0, microsoftPlaceholder0PivotFieldStatsInfo.getMax());
  assertEquals(6, (long) microsoftPlaceholder0PivotFieldStatsInfo.getCount());
  assertEquals(0, (long) microsoftPlaceholder0PivotFieldStatsInfo.getMissing());
  assertEquals(22700.0, microsoftPlaceholder0PivotFieldStatsInfo.getSum());
  assertEquals(1.0105E8, microsoftPlaceholder0PivotFieldStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(3783.333333333, (double) microsoftPlaceholder0PivotFieldStatsInfo.getMean(), 0.1E-7);
  assertEquals(1741.742422595, microsoftPlaceholder0PivotFieldStatsInfo.getStddev(), 0.1E-7);
}
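The asserted mean and stddev values follow from the aggregated sum, sumOfSquares, and count: Solr's StatsComponent reports the sample (n-1) standard deviation. A hedged sketch of that relationship; plugging in the cardiff values above (sum=347554.0, sumOfSquares=8.20968772E8, count=257) reproduces the expected ~1352.3502 mean and ~1170.8605 stddev.

// Illustrative helpers, not part of the test: how mean and stddev
// relate to the aggregated stats asserted above.
static double mean(double sum, long count) {
  return sum / count;
}

static double sampleStddev(double sum, double sumOfSquares, long count) {
  // sample (n-1) standard deviation, as reported by Solr's StatsComponent
  return Math.sqrt((sumOfSquares - (sum * sum) / count) / (count - 1));
}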
Use of org.apache.solr.client.solrj.response.FieldStatsInfo in project lucene-solr by apache.
In class DistributedFacetPivotLongTailTest, method doTestDeepPivotStats:
public void doTestDeepPivotStats() throws Exception {
  // Deep checking of some facet stats - no refinement involved here
  List<PivotField> pivots = query("q", "*:*", "shards", getShardsString(),
                                  "facet", "true", "rows", "0",
                                  "facet.pivot", "{!stats=s1}foo_s,bar_s",
                                  "stats", "true", "stats.field", "{!key=avg_price tag=s1}stat_i")
      .getFacetPivot().get("foo_s,bar_s");

  PivotField aaa0PivotField = pivots.get(0);
  assertEquals("aaa0", aaa0PivotField.getValue());
  assertEquals(300, aaa0PivotField.getCount());

  FieldStatsInfo aaa0StatsInfo = aaa0PivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", aaa0StatsInfo.getName());
  assertEquals(-99.0, aaa0StatsInfo.getMin());
  assertEquals(693.0, aaa0StatsInfo.getMax());
  assertEquals(300, (long) aaa0StatsInfo.getCount());
  assertEquals(0, (long) aaa0StatsInfo.getMissing());
  assertEquals(34650.0, aaa0StatsInfo.getSum());
  assertEquals(1.674585E7, aaa0StatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(115.5, (double) aaa0StatsInfo.getMean(), 0.1E-7);
  assertEquals(206.4493184076, aaa0StatsInfo.getStddev(), 0.1E-7);

  PivotField tailPivotField = pivots.get(5);
  assertEquals("tail", tailPivotField.getValue());
  assertEquals(135, tailPivotField.getCount());

  FieldStatsInfo tailPivotFieldStatsInfo = tailPivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", tailPivotFieldStatsInfo.getName());
  assertEquals(0.0, tailPivotFieldStatsInfo.getMin());
  assertEquals(44.0, tailPivotFieldStatsInfo.getMax());
  assertEquals(90, (long) tailPivotFieldStatsInfo.getCount());
  assertEquals(45, (long) tailPivotFieldStatsInfo.getMissing());
  assertEquals(1980.0, tailPivotFieldStatsInfo.getSum());
  assertEquals(22.0, (double) tailPivotFieldStatsInfo.getMean(), 0.1E-7);
  assertEquals(58740.0, tailPivotFieldStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(13.0599310011, tailPivotFieldStatsInfo.getStddev(), 0.1E-7);

  PivotField tailBPivotField = tailPivotField.getPivot().get(0);
  assertEquals("tailB", tailBPivotField.getValue());
  assertEquals(17, tailBPivotField.getCount());

  FieldStatsInfo tailBPivotFieldStatsInfo = tailBPivotField.getFieldStatsInfo().get("avg_price");
  assertEquals("avg_price", tailBPivotFieldStatsInfo.getName());
  assertEquals(35.0, tailBPivotFieldStatsInfo.getMin());
  assertEquals(40.0, tailBPivotFieldStatsInfo.getMax());
  assertEquals(12, (long) tailBPivotFieldStatsInfo.getCount());
  assertEquals(5, (long) tailBPivotFieldStatsInfo.getMissing());
  assertEquals(450.0, tailBPivotFieldStatsInfo.getSum());
  assertEquals(37.5, (double) tailBPivotFieldStatsInfo.getMean(), 0.1E-7);
  assertEquals(16910.0, tailBPivotFieldStatsInfo.getSumOfSquares(), 0.1E-7);
  assertEquals(1.78376517, tailBPivotFieldStatsInfo.getStddev(), 0.1E-7);
}
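Worth noting in the tail assertions: for a single-valued stats field, every document under a constraint either contributes a value or counts as missing, so count + missing equals the pivot count (90 + 45 = 135 for tail, 12 + 5 = 17 for tailB). A hedged sketch of that invariant as a reusable assertion (the helper name is illustrative, not part of the test):

// Illustrative helper: for a single-valued stats field, count + missing
// should add up to the pivot constraint's document count.
static void assertCountPlusMissing(PivotField constraint, FieldStatsInfo stats) {
  assertEquals("count + missing != constraint count",
               (long) constraint.getCount(),
               stats.getCount().longValue() + stats.getMissing().longValue());
}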
Use of org.apache.solr.client.solrj.response.FieldStatsInfo in project lucene-solr by apache.
In class DistributedFacetPivotSmallAdvancedTest, method doTestTopStatsWithRefinement:
/**
 * We need to ensure that stats never "overcount" the values from a single shard,
 * even if we hit that shard with a refinement request.
 */
private void doTestTopStatsWithRefinement(final boolean allStats) throws Exception {
  String stat_param = allStats ? "{!tag=s1}foo_i"
      : "{!tag=s1 min=true max=true count=true missing=true}foo_i";
  ModifiableSolrParams coreParams = params("q", "*:*", "rows", "0",
                                           "stats", "true", "stats.field", stat_param);
  ModifiableSolrParams facetParams = new ModifiableSolrParams(coreParams);
  facetParams.add(params("facet", "true", "facet.limit", "1",
                         "facet.pivot", "{!stats=s1}place_t,company_t"));
  ModifiableSolrParams facetForceRefineParams = new ModifiableSolrParams(facetParams);
  facetForceRefineParams.add(params(FacetParams.FACET_OVERREQUEST_COUNT, "0",
                                    FacetParams.FACET_OVERREQUEST_RATIO, "0"));

  for (ModifiableSolrParams params : new ModifiableSolrParams[] { coreParams, facetParams, facetForceRefineParams }) {
    // for all three sets of these params, the "top level"
    // stats in the response of a distributed query should be the same
    ModifiableSolrParams q = new ModifiableSolrParams(params);
    q.set("shards", getShardsString());
    QueryResponse rsp = queryServer(q);
    FieldStatsInfo fieldStatsInfo = rsp.getFieldStatsInfo().get("foo_i");
    String msg = q.toString();

    assertEquals(msg, 3.0, fieldStatsInfo.getMin());
    assertEquals(msg, 91.0, fieldStatsInfo.getMax());
    assertEquals(msg, 10, (long) fieldStatsInfo.getCount());
    assertEquals(msg, 0, (long) fieldStatsInfo.getMissing());

    if (allStats) {
      assertEquals(msg, 248.0, fieldStatsInfo.getSum());
      assertEquals(msg, 15294.0, fieldStatsInfo.getSumOfSquares(), 0.1E-7);
      assertEquals(msg, 24.8, (double) fieldStatsInfo.getMean(), 0.1E-7);
      assertEquals(msg, 31.87405772027709, fieldStatsInfo.getStddev(), 0.1E-7);
    } else {
      assertNull(msg, fieldStatsInfo.getSum());
      assertNull(msg, fieldStatsInfo.getSumOfSquares());
      assertNull(msg, fieldStatsInfo.getMean());
      assertNull(msg, fieldStatsInfo.getStddev());
    }

    if (params.getBool("facet", false)) {
      // if this was a facet request, then the top pivot constraint and pivot
      // stats should match what we expect - regardless of whether refinement
      // was used, or if the query was initially satisfied by the default overrequest
      List<PivotField> placePivots = rsp.getFacetPivot().get("place_t,company_t");
      assertEquals(1, placePivots.size());
      PivotField dublinPivotField = placePivots.get(0);
      assertEquals("dublin", dublinPivotField.getValue());
      assertEquals(4, dublinPivotField.getCount());
      assertEquals(1, dublinPivotField.getPivot().size());
      PivotField microsoftPivotField = dublinPivotField.getPivot().get(0);
      assertEquals("microsoft", microsoftPivotField.getValue());
      assertEquals(4, microsoftPivotField.getCount());
      FieldStatsInfo dublinMicrosoftStatsInfo = microsoftPivotField.getFieldStatsInfo().get("foo_i");
      assertEquals(3.0D, dublinMicrosoftStatsInfo.getMin());
      assertEquals(91.0D, dublinMicrosoftStatsInfo.getMax());
      assertEquals(4, (long) dublinMicrosoftStatsInfo.getCount());
      assertEquals(0, (long) dublinMicrosoftStatsInfo.getMissing());
      if (!allStats) {
        assertNull(msg, dublinMicrosoftStatsInfo.getSum());
        assertNull(msg, dublinMicrosoftStatsInfo.getSumOfSquares());
        assertNull(msg, dublinMicrosoftStatsInfo.getMean());
        assertNull(msg, dublinMicrosoftStatsInfo.getStddev());
      }
    }
  }

  // sanity check that the top pivot from each shard is different, to prove to
  // ourselves that the above queries really must have involved refinement
  Object s0pivValue = clients.get(0).query(facetParams).getFacetPivot().get("place_t,company_t").get(0).getValue();
  Object s1pivValue = clients.get(1).query(facetParams).getFacetPivot().get("place_t,company_t").get(0).getValue();
  assertFalse("both shards have same top constraint, test is invalid (did someone change the test data?) ==> " + s0pivValue + " == " + s1pivValue,
              s0pivValue.equals(s1pivValue));
}
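One practical consequence shown by the assertNull checks above: when stats.field restricts the computed stats via local params (e.g. {!min=true max=true count=true missing=true}), the stats that were not requested come back as null from SolrJ, so client code should guard before using them. A minimal sketch, assuming rsp is the QueryResponse from such a query over a numeric field:

// Sketch: guard against nulls when only some stats were requested.
FieldStatsInfo info = rsp.getFieldStatsInfo().get("foo_i");
Object sum = info.getSum();  // null unless sum was computed
if (sum != null) {
  double total = ((Number) sum).doubleValue();  // avoid a blind (Double) cast
  // ... use total ...
}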
Use of org.apache.solr.client.solrj.response.FieldStatsInfo in project lucene-solr by apache.
In class TestDistributedStatsComponentCardinality, method test:
public void test() throws Exception {
  buildIndex();

  { // simple sanity checks - don't leak variables
    QueryResponse rsp = null;
    rsp = query(params("rows", "0", "q", "id:42"));
    assertEquals(1, rsp.getResults().getNumFound());

    rsp = query(params("rows", "0", "q", "*:*",
                       "stats", "true", "stats.field", "{!min=true max=true}long_l"));
    assertEquals(NUM_DOCS, rsp.getResults().getNumFound());
    assertEquals(MIN_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMin()));
    assertEquals(MAX_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMax()));
  }

  final int NUM_QUERIES = atLeast(100);

  // Some randomized queries with randomized log2m and max regwidth
  for (int i = 0; i < NUM_QUERIES; i++) {
    // testing shows that on random data, at the size we're dealing with,
    // MINIMUM_LOG2M_PARAM is just too absurdly small to give anything remotely
    // close to the theoretically expected relative error.
    //
    // So we have to use a slightly higher lower bound on what log2m values we randomly test
    final int log2m = TestUtil.nextInt(random(), 2 + HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM);

    // use max regwidth to try and prevent hash collisions from introducing problems
    final int regwidth = HLL.MAXIMUM_REGWIDTH_PARAM;

    final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS - 2000);
    final int highId = TestUtil.nextInt(random(), lowId + 1000, NUM_DOCS);
    final int numMatches = 1 + highId - lowId;

    SolrParams p = buildCardinalityQ(lowId, highId, log2m, regwidth);
    QueryResponse rsp = query(p);
    assertEquals("sanity check num matches, p=" + p, numMatches, rsp.getResults().getNumFound());

    Map<String, FieldStatsInfo> stats = rsp.getFieldStatsInfo();

    for (String f : STAT_FIELDS) {
      // regardless of log2m and regwidth, the estimated cardinality of the
      // hashed vs prehashed values should be exactly the same for each field
      assertEquals(f + ": hashed vs prehashed, real=" + numMatches + ", p=" + p,
                   stats.get(f).getCardinality().longValue(),
                   stats.get(f + "_prehashed_l").getCardinality().longValue());
    }

    for (String f : STAT_FIELDS) {
      // check the relative error of the estimate returned against the known truth
      // (note the double division, so the comparison is a real relative error,
      // not truncating integer division)
      final double relErr = expectedRelativeError(log2m);
      final long estimate = stats.get(f).getCardinality().longValue();
      assertTrue(f + ": relativeErr=" + relErr + ", estimate=" + estimate + ", real=" + numMatches + ", p=" + p,
                 (Math.abs(numMatches - estimate) / (double) numMatches) < relErr);
    }
  }

  // Some randomized queries with both low and high accuracy options
  for (int i = 0; i < NUM_QUERIES; i++) {
    final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS - 2000);
    final int highId = TestUtil.nextInt(random(), lowId + 1000, NUM_DOCS);
    final int numMatches = 1 + highId - lowId;

    // WTF? - https://github.com/aggregateknowledge/java-hll/issues/15
    //
    // apparently we can't rely on estimates always being more accurate with higher log2m values?
    // so for now, just try testing accuracy values that differ by at least 0.5
    //
    // (that should give us a significant enough log2m diff that the "highAccuracy" is always
    // more accurate -- if not, then the entire premise of the float value is fundamentally bogus)
    //
    final double lowAccuracy = random().nextDouble() / 2;
    // final double highAccuracy = Math.min(1.0D, lowAccuracy + (random().nextDouble() / 2));
    final double highAccuracy = Math.min(1.0D, lowAccuracy + 0.5D);

    SolrParams p = buildCardinalityQ(lowId, highId, lowAccuracy, highAccuracy);
    QueryResponse rsp = query(p);
    assertEquals("sanity check num matches, p=" + p, numMatches, rsp.getResults().getNumFound());

    Map<String, FieldStatsInfo> stats = rsp.getFieldStatsInfo();

    // compare against the prehashed long variants...
    for (String f : new String[] { "long_l", "string_s" }) {
      // regardless of accuracy, the estimated cardinality of the
      // hashed vs prehashed values should be exactly the same for each field
      assertEquals(f + ": hashed vs prehashed (low), real=" + numMatches + ", p=" + p,
                   stats.get("low_" + f).getCardinality().longValue(),
                   stats.get("low_" + f + "_prehashed_l").getCardinality().longValue());
      assertEquals(f + ": hashed vs prehashed (high), real=" + numMatches + ", p=" + p,
                   stats.get("high_" + f).getCardinality().longValue(),
                   stats.get("high_" + f + "_prehashed_l").getCardinality().longValue());
    }

    for (String f : STAT_FIELDS) {
      for (String ff : new String[] { f, f + "_prehashed_l" }) {
        // for both the prehashed and regular fields, the high accuracy option
        // should always produce an estimate at least as good as the low accuracy option
        long poorEst = stats.get("low_" + ff).getCardinality();
        long goodEst = stats.get("high_" + ff).getCardinality();
        assertTrue(ff + ": goodEst=" + goodEst + ", poorEst=" + poorEst + ", real=" + numMatches + ", p=" + p,
                   Math.abs(numMatches - goodEst) <= Math.abs(numMatches - poorEst));
      }
    }
  }
}
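The expectedRelativeError(log2m) helper is not shown in this snippet, but the standard HyperLogLog error bound it presumably builds on is well known: with m = 2^log2m registers, the estimator's relative standard error is about 1.04 / sqrt(m). A hedged sketch of that bound (the test's actual helper may add extra slop for stability):

// Illustrative, not the test's actual expectedRelativeError implementation:
// the standard HyperLogLog relative-error bound for 2^log2m registers.
static double hllRelativeError(int log2m) {
  long m = 1L << log2m;         // number of HLL registers
  return 1.04D / Math.sqrt(m);  // ~standard error of the cardinality estimate
}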