Use of org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter in project lucene-solr by apache.
From the class TestMultipleIndexFields, method testDefault.
@Test
public void testDefault() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();
  seedIndex(tw, iw, config);
  IndexReader ir = iw.getReader();
  tw.commit();
  // prepare index reader and taxonomy.
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);
  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);
  FacetsCollector sfc = performSearch(tr, ir, searcher);
  // Obtain facets results and hand-test them
  assertCorrectResults(getTaxonomyFacetCounts(tr, config, sfc));
  assertOrdinalsExist("$facets", ir);
  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}
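
The flow above (one RandomIndexWriter for documents, one DirectoryTaxonomyWriter for the sidecar taxonomy, and a FacetsConfig whose build() call translates FacetFields into taxonomy ordinals) is the standard taxonomy-facet indexing pattern. A minimal standalone sketch of the same flow outside the test harness; the on-disk paths and the class name are placeholders, not part of the test:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TaxonomyIndexSketch {
  public static void main(String[] args) throws Exception {
    Directory indexDir = FSDirectory.open(Paths.get("main-index")); // placeholder path
    Directory taxoDir = FSDirectory.open(Paths.get("taxo-index")); // placeholder path
    IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(new StandardAnalyzer()).setOpenMode(OpenMode.CREATE));
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
    FacetsConfig config = new FacetsConfig();
    Document doc = new Document();
    doc.add(new FacetField("Author", "Bob"));
    // build() rewrites the FacetField into ordinals, registering new category
    // paths with the taxonomy writer as a side effect
    indexWriter.addDocument(config.build(taxoWriter, doc));
    // commit the taxonomy no later than the index, so every ordinal the index
    // references already exists in the taxonomy
    taxoWriter.commit();
    indexWriter.commit();
    indexWriter.close();
    taxoWriter.close();
    indexDir.close();
    taxoDir.close();
  }
}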
Use of org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter in project lucene-solr by apache.
From the class TestRandomSamplingFacetsCollector, method testRandomSampling.
public void testRandomSampling() throws Exception {
  Directory dir = newDirectory();
  Directory taxoDir = newDirectory();
  Random random = random();
  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  FacetsConfig config = new FacetsConfig();
  final int numCategories = 10;
  int numDocs = atLeast(10000);
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
    doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
    writer.addDocument(config.build(taxoWriter, doc));
  }
  writer.forceMerge(CHI_SQUARE_VALUES.length - 1);
  // NRT open
  IndexSearcher searcher = newSearcher(writer.getReader());
  TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  IOUtils.close(writer, taxoWriter);
  // Test empty results
  RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  // There should be no divisions by zero
  searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
  // There should be no divisions by zero and no null result
  assertNotNull(collectRandomZeroResults.getMatchingDocs());
  // There should be no results at all
  for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
    assertEquals(0, doc.totalHits);
  }
  // Now start searching and retrieve results.
  // Use a query to select half of the documents.
  TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
  // 10% of total docs, 20% of the hits
  RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
  FacetsCollector fc = new FacetsCollector();
  searcher.search(query, MultiCollector.wrap(fc, random10Percent));
  final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();
  // count the total hits and sampled docs, also store the number of sampled
  // docs per segment
  int totalSampledDocs = 0, totalHits = 0;
  int[] numSampledDocs = new int[matchingDocs.size()];
  // System.out.println("numSegments=" + numSampledDocs.length);
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    final DocIdSetIterator iter = md.bits.iterator();
    while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i];
    totalSampledDocs += numSampledDocs[i];
    totalHits += md.totalHits;
  }
  // compute the chi-square value for the sampled documents' distribution
  float chi_square = 0;
  for (int i = 0; i < numSampledDocs.length; i++) {
    MatchingDocs md = matchingDocs.get(i);
    float ei = (float) md.totalHits / totalHits;
    if (ei > 0.0f) {
      float oi = (float) numSampledDocs[i] / totalSampledDocs;
      chi_square += (Math.pow(ei - oi, 2) / ei);
    }
  }
  // Verify that the chi-square value isn't too big. According to
  // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
  // we basically verify that there is a really small chance of hitting a very
  // bad sample (p-value < 0.05), for n degrees of freedom. The number 'n' depends
  // on the number of segments.
  assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);
  // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
  final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
  final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
  final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
  for (int i = 0; i < amortized10Result.labelValues.length; i++) {
    LabelAndValue amortized = amortized10Result.labelValues[i];
    LabelAndValue sampled = random10Result.labelValues[i];
    // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count
    assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
  }
  IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
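
Two pieces of arithmetic in this test are easy to miss. The per-segment check is the Pearson goodness-of-fit statistic, restated here from the loop above, with e_i the segment's share of the true hits and o_i its share of the sampled docs:

\[ \chi^2 = \sum_{i=1}^{n} \frac{(e_i - o_i)^2}{e_i}, \qquad e_i = \frac{\mathrm{hits}_i}{\sum_j \mathrm{hits}_j}, \qquad o_i = \frac{\mathrm{sampled}_i}{\sum_j \mathrm{sampled}_j}, \]

where n is the number of segments. The amortization factor follows from the sampling rate: the query matches the numDocs/2 even documents while the collector keeps at most numDocs/10 of them, i.e. roughly one hit in five, so amortizeFacetCounts scales each sampled count by about 5; hence the assertion against 5 * sampled, capped at numDocs/10, the true per-category count among the even documents.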
Use of org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter in project lucene-solr by apache.
From the class TestMultipleIndexFields, method testDifferentFieldsAndText.
@Test
public void testDifferentFieldsAndText() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();
  config.setIndexFieldName("Band", "$bands");
  config.setIndexFieldName("Composer", "$composers");
  seedIndex(tw, iw, config);
  IndexReader ir = iw.getReader();
  tw.commit();
  // prepare index reader and taxonomy.
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);
  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);
  FacetsCollector sfc = performSearch(tr, ir, searcher);
  Map<String, Facets> facetsMap = new HashMap<>();
  facetsMap.put("Band", getTaxonomyFacetCounts(tr, config, sfc, "$bands"));
  facetsMap.put("Composer", getTaxonomyFacetCounts(tr, config, sfc, "$composers"));
  Facets facets = new MultiFacets(facetsMap, getTaxonomyFacetCounts(tr, config, sfc));
  // Obtain facets results and hand-test them
  assertCorrectResults(facets);
  assertOrdinalsExist("$facets", ir);
  assertOrdinalsExist("$bands", ir);
  assertOrdinalsExist("$composers", ir);
  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}
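
performSearch and getTaxonomyFacetCounts are helpers from the test superclass. Outside the tests, the equivalent search-side wiring, counting each custom index field separately and stitching the results together with MultiFacets, would look roughly like the sketch below; the method and variable names are assumptions, not part of the test:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.MultiFacets;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

public class MultiFieldFacetsSketch {
  static Facets countPerField(IndexReader reader, TaxonomyReader taxoReader, FacetsConfig config) throws Exception {
    FacetsCollector fc = new FacetsCollector();
    FacetsCollector.search(new IndexSearcher(reader), new MatchAllDocsQuery(), 10, fc);
    Map<String, Facets> byDim = new HashMap<>();
    // dimensions routed to dedicated index fields are counted per field
    byDim.put("Band", new FastTaxonomyFacetCounts("$bands", taxoReader, config, fc));
    byDim.put("Composer", new FastTaxonomyFacetCounts("$composers", taxoReader, config, fc));
    // every other dimension falls back to counts over the default "$facets" field
    return new MultiFacets(byDim, new FastTaxonomyFacetCounts(taxoReader, config, fc));
  }
}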
Use of org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter in project lucene-solr by apache.
From the class TestTaxonomyFacetSumValueSource, method testSumScoreAggregator.
public void testSumScoreAggregator() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
  FacetsConfig config = new FacetsConfig();
  for (int i = atLeast(30); i > 0; --i) {
    Document doc = new Document();
    if (random().nextBoolean()) {
      // don't match all documents
      doc.add(new StringField("f", "v", Field.Store.NO));
    }
    doc.add(new FacetField("dim", "a"));
    iw.addDocument(config.build(taxoWriter, doc));
  }
  DirectoryReader r = DirectoryReader.open(iw);
  DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  FacetsCollector fc = new FacetsCollector(true);
  BoostQuery csq = new BoostQuery(new ConstantScoreQuery(new MatchAllDocsQuery()), 2f);
  TopDocs td = FacetsCollector.search(newSearcher(r), csq, 10, fc);
  Facets facets = new TaxonomyFacetSumValueSource(taxoReader, config, fc, DoubleValuesSource.SCORES);
  int expected = (int) (td.getMaxScore() * td.totalHits);
  assertEquals(expected, facets.getSpecificValue("dim", "a").intValue());
  iw.close();
  IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir);
}
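
The expected value is a closed-form restatement of the setup: the boosted constant-score query matches every document with the same score of 2.0, and every document carries the category dim/a, so

\[ \sum_{d \in \mathrm{hits}} \mathrm{score}(d) = 2.0 \cdot \mathrm{totalHits} = \mathrm{maxScore} \cdot \mathrm{totalHits}, \]

which is exactly what the test computes from td.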
Use of org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter in project lucene-solr by apache.
From the class TestTaxonomyFacetSumValueSource, method testWithScore.
public void testWithScore() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
  IndexWriter iw = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
  FacetsConfig config = new FacetsConfig();
  for (int i = 0; i < 4; i++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("price", (i + 1)));
    doc.add(new FacetField("a", Integer.toString(i % 2)));
    iw.addDocument(config.build(taxoWriter, doc));
  }
  DirectoryReader r = DirectoryReader.open(iw);
  DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
  FacetsCollector fc = new FacetsCollector(true);
  // score documents by their 'price' field - makes asserting the correct counts for the categories easier
  Query q = new FunctionQuery(new LongFieldSource("price"));
  FacetsCollector.search(newSearcher(r), q, 10, fc);
  Facets facets = new TaxonomyFacetSumValueSource(taxoReader, config, fc, DoubleValuesSource.SCORES);
  assertEquals("dim=a path=[] value=10.0 childCount=2\n  1 (6.0)\n  0 (4.0)\n", facets.getTopChildren(10, "a").toString());
  iw.close();
  IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir);
}
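
The asserted string follows directly from the four documents: FunctionQuery scores document i by its price i + 1, and document i is filed under category i % 2, so

\[ \mathrm{value}(\text{"1"}) = 2 + 4 = 6.0, \qquad \mathrm{value}(\text{"0"}) = 1 + 3 = 4.0, \qquad \mathrm{total} = 10.0, \]

with childCount = 2 and "1" listed first because getTopChildren orders children by descending value.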