Use of org.apache.lucene.facet.taxonomy.TaxonomyReader in project lucene-solr by apache.
From class TestRangeFacetCounts, method testMixedRangeAndNonRangeTaxonomy:
/** Tests a single request that mixes Range and non-Range
 *  faceting, with DrillSideways and taxonomy. */
public void testMixedRangeAndNonRangeTaxonomy() throws Exception {
  Directory d = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), d);
  Directory td = newDirectory();
  DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
  FacetsConfig config = new FacetsConfig();
  for (long l = 0; l < 100; l++) {
    Document doc = new Document();
    // For computing range facet counts:
    doc.add(new NumericDocValuesField("field", l));
    // For drill down by numeric range:
    doc.add(new LongPoint("field", l));
    if ((l & 3) == 0) {
      doc.add(new FacetField("dim", "a"));
    } else {
      doc.add(new FacetField("dim", "b"));
    }
    w.addDocument(config.build(tw, doc));
  }
  final IndexReader r = w.getReader();
  final TaxonomyReader tr = new DirectoryTaxonomyReader(tw);
  IndexSearcher s = newSearcher(r, false);
  if (VERBOSE) {
    System.out.println("TEST: searcher=" + s);
  }

  DrillSideways ds = new DrillSideways(s, config, tr) {

    @Override
    protected Facets buildFacetsResult(FacetsCollector drillDowns, FacetsCollector[] drillSideways,
                                       String[] drillSidewaysDims) throws IOException {
      FacetsCollector dimFC = drillDowns;
      FacetsCollector fieldFC = drillDowns;
      if (drillSideways != null) {
        for (int i = 0; i < drillSideways.length; i++) {
          String dim = drillSidewaysDims[i];
          if (dim.equals("field")) {
            fieldFC = drillSideways[i];
          } else {
            dimFC = drillSideways[i];
          }
        }
      }
      Map<String, Facets> byDim = new HashMap<>();
      byDim.put("field",
          new LongRangeFacetCounts("field", fieldFC,
              new LongRange("less than 10", 0L, true, 10L, false),
              new LongRange("less than or equal to 10", 0L, true, 10L, true),
              new LongRange("over 90", 90L, false, 100L, false),
              new LongRange("90 or above", 90L, true, 100L, false),
              new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false)));
      byDim.put("dim", getTaxonomyFacetCounts(taxoReader, config, dimFC));
      return new MultiFacets(byDim, null);
    }

    @Override
    protected boolean scoreSubDocsAtOnce() {
      return random().nextBoolean();
    }
  };

  // First search, no drill downs:
  DrillDownQuery ddq = new DrillDownQuery(config);
  DrillSidewaysResult dsr = ds.search(null, ddq, 10);
  assertEquals(100, dsr.hits.totalHits);
  assertEquals("dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n",
      dsr.facets.getTopChildren(10, "dim").toString());
  assertEquals("dim=field path=[] value=21 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n",
      dsr.facets.getTopChildren(10, "field").toString());

  // Second search, drill down on dim=b:
  ddq = new DrillDownQuery(config);
  ddq.add("dim", "b");
  dsr = ds.search(null, ddq, 10);
  assertEquals(75, dsr.hits.totalHits);
  assertEquals("dim=dim path=[] value=100 childCount=2\n b (75)\n a (25)\n",
      dsr.facets.getTopChildren(10, "dim").toString());
  assertEquals("dim=field path=[] value=16 childCount=5\n less than 10 (7)\n less than or equal to 10 (8)\n over 90 (7)\n 90 or above (8)\n over 1000 (0)\n",
      dsr.facets.getTopChildren(10, "field").toString());

  // Third search, drill down on "less than or equal to 10":
  ddq = new DrillDownQuery(config);
  ddq.add("field", LongPoint.newRangeQuery("field", 0L, 10L));
  dsr = ds.search(null, ddq, 10);
  assertEquals(11, dsr.hits.totalHits);
  assertEquals("dim=dim path=[] value=11 childCount=2\n b (8)\n a (3)\n",
      dsr.facets.getTopChildren(10, "dim").toString());
  assertEquals("dim=field path=[] value=21 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n",
      dsr.facets.getTopChildren(10, "field").toString());

  w.close();
  IOUtils.close(tw, tr, td, r, d);
}
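
Outside the test harness, the same drill-sideways pattern takes only a few lines. Below is a minimal sketch, assuming an already-open IndexSearcher, TaxonomyReader, and FacetsConfig (the searcher, taxoReader, and config parameters and the "dim"/"b" values are placeholders, not taken from the test above):

import java.io.IOException;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.DrillSideways;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.search.IndexSearcher;

public class DrillSidewaysSketch {

  /** Drills down on one dimension while still counting that dimension's siblings. */
  static FacetResult countSiblings(IndexSearcher searcher, TaxonomyReader taxoReader,
                                   FacetsConfig config) throws IOException {
    DrillSideways ds = new DrillSideways(searcher, config, taxoReader);
    DrillDownQuery ddq = new DrillDownQuery(config);
    ddq.add("dim", "b"); // hits are restricted to dim=b ...
    DrillSidewaysResult result = ds.search(ddq, 10);
    // ... yet the counts for "dim" still include the sibling "a", because
    // DrillSideways also collects the near-misses for each drilled-down dimension.
    return result.facets.getTopChildren(10, "dim");
  }
}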
Use of org.apache.lucene.facet.taxonomy.TaxonomyReader in project lucene-solr by apache.
From class TestMultipleIndexFields, method testTwoCustomsSameField:
@Test
public void testTwoCustomsSameField() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();
  config.setIndexFieldName("Band", "$music");
  config.setIndexFieldName("Composer", "$music");
  seedIndex(tw, iw, config);
  IndexReader ir = iw.getReader();
  tw.commit();
  // prepare index reader and taxonomy
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);
  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);
  FacetsCollector sfc = performSearch(tr, ir, searcher);
  Map<String, Facets> facetsMap = new HashMap<>();
  Facets facets2 = getTaxonomyFacetCounts(tr, config, sfc, "$music");
  facetsMap.put("Band", facets2);
  facetsMap.put("Composer", facets2);
  Facets facets = new MultiFacets(facetsMap, getTaxonomyFacetCounts(tr, config, sfc));
  // Obtain facets results and hand-test them
  assertCorrectResults(facets);
  assertOrdinalsExist("$facets", ir);
  assertOrdinalsExist("$music", ir);
assertOrdinalsExist("$music", ir);
  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}
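
The getTaxonomyFacetCounts(...) call is a helper from the test's base class; in application code the closest equivalent is FastTaxonomyFacetCounts aimed at the shared index field. A minimal sketch, assuming an open TaxonomyReader and a FacetsCollector already populated by a search (both are placeholder parameters here):

import java.io.IOException;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

public class SharedFieldSketch {

  /** Both dimensions share the "$music" index field, so one counts instance serves both. */
  static Facets countSharedField(TaxonomyReader taxoReader, FacetsCollector fc) throws IOException {
    FacetsConfig config = new FacetsConfig();
    config.setIndexFieldName("Band", "$music");
    config.setIndexFieldName("Composer", "$music");
    // One pass over the "$music" doc values counts ordinals for Band and Composer alike:
    return new FastTaxonomyFacetCounts("$music", taxoReader, config, fc);
  }
}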
Use of org.apache.lucene.facet.taxonomy.TaxonomyReader in project lucene-solr by apache.
From class TestMultipleIndexFields, method testDefault:
@Test
public void testDefault() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();
  seedIndex(tw, iw, config);
  IndexReader ir = iw.getReader();
  tw.commit();
  // prepare index reader and taxonomy
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);
  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);
  FacetsCollector sfc = performSearch(tr, ir, searcher);
  // Obtain facets results and hand-test them
  assertCorrectResults(getTaxonomyFacetCounts(tr, config, sfc));
  assertOrdinalsExist("$facets", ir);
  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}
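
With no setIndexFieldName overrides, FacetsConfig stores every dimension's ordinals under its default index field, "$facets" (the FacetsConfig.DEFAULT_INDEX_FIELD_NAME constant), which is what the assertOrdinalsExist check above relies on. A minimal counting sketch against that default field, with taxoReader and fc as placeholder parameters:

import java.io.IOException;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

public class DefaultFieldSketch {

  static Facets countDefaultField(TaxonomyReader taxoReader, FacetsCollector fc) throws IOException {
    FacetsConfig config = new FacetsConfig();
    // No per-dimension overrides: ordinals live in "$facets", so the
    // short-form constructor (which reads that default field) is enough.
    return new FastTaxonomyFacetCounts(taxoReader, config, fc);
  }
}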
Use of org.apache.lucene.facet.taxonomy.TaxonomyReader in project lucene-solr by apache.
From class TestRandomSamplingFacetsCollector, method testRandomSampling:
public void testRandomSampling() throws Exception {
  Directory dir = newDirectory();
  Directory taxoDir = newDirectory();
  Random random = random();
  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  FacetsConfig config = new FacetsConfig();
  final int numCategories = 10;
  int numDocs = atLeast(10000);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
    doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
    writer.addDocument(config.build(taxoWriter, doc));
  }
  writer.forceMerge(CHI_SQUARE_VALUES.length - 1);

  // NRT open
  IndexSearcher searcher = newSearcher(writer.getReader());
  TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  IOUtils.close(writer, taxoWriter);

  // Test empty results
  RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  // There should be no divisions by zero
  searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
  // There should be no divisions by zero and no null result
  assertNotNull(collectRandomZeroResults.getMatchingDocs());
  // There should be no results at all
  for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
    assertEquals(0, doc.totalHits);
  }

  // Now start searching and retrieve results.
  // Use a query to select half of the documents.
  TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

  // 10% of total docs, 20% of the hits
  RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  FacetsCollector fc = new FacetsCollector();
  searcher.search(query, MultiCollector.wrap(fc, random10Percent));
  final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();

  // count the total hits and sampled docs, also store the number of sampled
  // docs per segment
  int totalSampledDocs = 0, totalHits = 0;
  int[] numSampledDocs = new int[matchingDocs.size()];
  // System.out.println("numSegments=" + numSampledDocs.length);
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    final DocIdSetIterator iter = md.bits.iterator();
    while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      ++numSampledDocs[i];
    }
    totalSampledDocs += numSampledDocs[i];
    totalHits += md.totalHits;
  }

  // Compute the chi-square value for the sampled documents' distribution:
  // chi^2 = sum_i (e_i - o_i)^2 / e_i, where e_i = hits_i / totalHits is the
  // expected share of segment i and o_i = sampledDocs_i / totalSampledDocs is
  // the observed share.
  float chi_square = 0;
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    float ei = (float) md.totalHits / totalHits;
    if (ei > 0.0f) {
      float oi = (float) numSampledDocs[i] / totalSampledDocs;
      chi_square += (Math.pow(ei - oi, 2) / ei);
    }
  }

  // Verify that the chi-square value isn't too big. According to
  // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
  // we basically verify that there is a really small chance of hitting a very
  // bad sample (p-value < 0.05), for n degrees of freedom. The number 'n' depends
  // on the number of segments.
  assertTrue("chisquare not statistically significant enough: " + chi_square,
      chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);

  // Test amortized counts - should be 5X the sampled count, but at most numDocs/10
  final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
  final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
  final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
  for (int i = 0; i < amortized10Result.labelValues.length; i++) {
    LabelAndValue amortized = amortized10Result.labelValues[i];
    LabelAndValue sampled = random10Result.labelValues[i];
    // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
    assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
  }
  IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
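
The workflow this test exercises is: collect a random sample of the hits, count facets over just the sample, then scale the counts back up with amortizeFacetCounts. A minimal sketch, assuming an open IndexSearcher, TaxonomyReader, FacetsConfig, and some Query (all placeholder parameters), and reusing the test's "iMod10" dimension:

import java.io.IOException;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.RandomSamplingFacetsCollector;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

public class SampledFacetsSketch {

  static FacetResult sampledTopChildren(IndexSearcher searcher, TaxonomyReader taxoReader,
                                        FacetsConfig config, Query query) throws IOException {
    // Keep at most 1000 hits, chosen at random; a fixed seed makes runs reproducible.
    RandomSamplingFacetsCollector sampling = new RandomSamplingFacetsCollector(1000, 42L);
    searcher.search(query, sampling);
    // Count facets over the sample only:
    FastTaxonomyFacetCounts counts = new FastTaxonomyFacetCounts(taxoReader, config, sampling);
    FacetResult sampled = counts.getTopChildren(10, "iMod10");
    // Scale the sampled counts back toward what a full count would have returned:
    return sampling.amortizeFacetCounts(sampled, config, searcher);
  }
}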
Use of org.apache.lucene.facet.taxonomy.TaxonomyReader in project lucene-solr by apache.
From class TestMultipleIndexFields, method testDifferentFieldsAndText:
@Test
public void testDifferentFieldsAndText() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();
  config.setIndexFieldName("Band", "$bands");
  config.setIndexFieldName("Composer", "$composers");
  seedIndex(tw, iw, config);
  IndexReader ir = iw.getReader();
  tw.commit();
  // prepare index reader and taxonomy
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);
  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);
  FacetsCollector sfc = performSearch(tr, ir, searcher);
  Map<String, Facets> facetsMap = new HashMap<>();
  facetsMap.put("Band", getTaxonomyFacetCounts(tr, config, sfc, "$bands"));
  facetsMap.put("Composer", getTaxonomyFacetCounts(tr, config, sfc, "$composers"));
  Facets facets = new MultiFacets(facetsMap, getTaxonomyFacetCounts(tr, config, sfc));
  // Obtain facets results and hand-test them
  assertCorrectResults(facets);
  assertOrdinalsExist("$facets", ir);
  assertOrdinalsExist("$bands", ir);
  assertOrdinalsExist("$composers", ir);
  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}
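
Giving each dimension its own index field means each one can be counted independently, reading only its own doc-values field, with MultiFacets dispatching per-dimension requests. A minimal sketch, with taxoReader and fc as placeholder parameters:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.MultiFacets;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

public class PerFieldFacetsSketch {

  static Facets perFieldCounts(TaxonomyReader taxoReader, FacetsCollector fc) throws IOException {
    FacetsConfig config = new FacetsConfig();
    config.setIndexFieldName("Band", "$bands");
    config.setIndexFieldName("Composer", "$composers");
    Map<String, Facets> byDim = new HashMap<>();
    byDim.put("Band", new FastTaxonomyFacetCounts("$bands", taxoReader, config, fc));
    byDim.put("Composer", new FastTaxonomyFacetCounts("$composers", taxoReader, config, fc));
    // Requests for dimensions not in the map fall through to the default Facets:
    return new MultiFacets(byDim, new FastTaxonomyFacetCounts(taxoReader, config, fc));
  }
}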