Examples with Similarity - org.apache.lucene.search.similarities.Similarity

Example 41 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestDocValuesScoring method testSimple.

/* for comparing floats */
public void testSimple() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    Field field = newTextField("foo", "", Field.Store.NO);
    doc.add(field);
    Field dvField = new FloatDocValuesField("foo_boost", 0.0F);
    doc.add(dvField);
    Field field2 = newTextField("bar", "", Field.Store.NO);
    doc.add(field2);
    field.setStringValue("quick brown fox");
    field2.setStringValue("quick brown fox");
    // boost x2
    dvField.setFloatValue(2f);
    iw.addDocument(doc);
    field.setStringValue("jumps over lazy brown dog");
    field2.setStringValue("jumps over lazy brown dog");
    // boost x4
    dvField.setFloatValue(4f);
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    // no boosting
    IndexSearcher searcher1 = newSearcher(ir, false);
    final Similarity base = searcher1.getSimilarity(true);
    // boosting
    IndexSearcher searcher2 = newSearcher(ir, false);
    searcher2.setSimilarity(new PerFieldSimilarityWrapper() {

        final Similarity fooSim = new BoostingSimilarity(base, "foo_boost");

        @Override
        public Similarity get(String field) {
            return "foo".equals(field) ? fooSim : base;
        }
    });
    // in this case, we searched on field "foo". first document should have 2x the score.
    TermQuery tq = new TermQuery(new Term("foo", "quick"));
    QueryUtils.check(random(), tq, searcher1);
    QueryUtils.check(random(), tq, searcher2);
    TopDocs noboost = searcher1.search(tq, 10);
    TopDocs boost = searcher2.search(tq, 10);
    assertEquals(1, noboost.totalHits);
    assertEquals(1, boost.totalHits);
    //System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc));
    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2f, SCORE_EPSILON);
    // this query matches only the second document, which should have 4x the score.
    tq = new TermQuery(new Term("foo", "jumps"));
    QueryUtils.check(random(), tq, searcher1);
    QueryUtils.check(random(), tq, searcher2);
    noboost = searcher1.search(tq, 10);
    boost = searcher2.search(tq, 10);
    assertEquals(1, noboost.totalHits);
    assertEquals(1, boost.totalHits);
    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4f, SCORE_EPSILON);
    // search on on field bar just for kicks, nothing should happen, since we setup
    // our sim provider to only use foo_boost for field foo.
    tq = new TermQuery(new Term("bar", "quick"));
    QueryUtils.check(random(), tq, searcher1);
    QueryUtils.check(random(), tq, searcher2);
    noboost = searcher1.search(tq, 10);
    boost = searcher2.search(tq, 10);
    assertEquals(1, noboost.totalHits);
    assertEquals(1, boost.totalHits);
    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON);
    ir.close();
    dir.close();
}

Also used : Similarity(org.apache.lucene.search.similarities.Similarity) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Field(org.apache.lucene.document.Field) IndexReader(org.apache.lucene.index.IndexReader) PerFieldSimilarityWrapper(org.apache.lucene.search.similarities.PerFieldSimilarityWrapper) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 42 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestNorms method buildIndex.

// TODO: create a testNormsNotPresent ourselves by adding/deleting/merging docs
public void buildIndex(Directory dir) throws IOException {
    Random random = random();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig config = newIndexWriterConfig(analyzer);
    Similarity provider = new MySimProvider();
    config.setSimilarity(provider);
    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
    final LineFileDocs docs = new LineFileDocs(random);
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
        Document doc = docs.nextDoc();
        int boost = TestUtil.nextInt(random, 1, 255);
        String value = IntStream.range(0, boost).mapToObj(k -> Integer.toString(boost)).collect(Collectors.joining(" "));
        Field f = new TextField(BYTE_TEST_FIELD, value, Field.Store.YES);
        doc.add(f);
        writer.addDocument(doc);
        doc.removeField(BYTE_TEST_FIELD);
        if (rarely()) {
            writer.commit();
        }
    }
    writer.commit();
    writer.close();
    docs.close();
}

Also used : IntStream(java.util.stream.IntStream) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) SuppressCodecs(org.apache.lucene.util.LuceneTestCase.SuppressCodecs) Slow(org.apache.lucene.util.LuceneTestCase.Slow) TestUtil(org.apache.lucene.util.TestUtil) IOException(java.io.IOException) Random(java.util.Random) PerFieldSimilarityWrapper(org.apache.lucene.search.similarities.PerFieldSimilarityWrapper) Collectors(java.util.stream.Collectors) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) LineFileDocs(org.apache.lucene.util.LineFileDocs) Field(org.apache.lucene.document.Field) Similarity(org.apache.lucene.search.similarities.Similarity) Directory(org.apache.lucene.store.Directory) Store(org.apache.lucene.document.Field.Store) LuceneTestCase(org.apache.lucene.util.LuceneTestCase) TextField(org.apache.lucene.document.TextField) TermStatistics(org.apache.lucene.search.TermStatistics) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Random(java.util.Random) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) TextField(org.apache.lucene.document.TextField) LineFileDocs(org.apache.lucene.util.LineFileDocs)

Example 43 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestDiversifiedTopDocsCollector method setUp.

@Override
public void setUp() throws Exception {
    super.setUp();
    // populate an index with documents - artist, song and weeksAtNumberOne
    dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    Field yearField = newTextField("year", "", Field.Store.NO);
    SortedDocValuesField artistField = new SortedDocValuesField("artist", new BytesRef(""));
    Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne", 0.0F);
    Field weeksStoredField = new StoredField("weeks", 0.0F);
    Field idField = newStringField("id", "", Field.Store.YES);
    Field songField = newTextField("song", "", Field.Store.NO);
    Field storedArtistField = newTextField("artistName", "", Field.Store.NO);
    doc.add(idField);
    doc.add(weeksAtNumberOneField);
    doc.add(storedArtistField);
    doc.add(songField);
    doc.add(weeksStoredField);
    doc.add(yearField);
    doc.add(artistField);
    parsedRecords.clear();
    for (int i = 0; i < hitsOfThe60s.length; i++) {
        String[] cols = hitsOfThe60s[i].split("\t");
        Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2], Float.parseFloat(cols[3]));
        parsedRecords.put(record.id, record);
        idField.setStringValue(record.id);
        yearField.setStringValue(record.year);
        storedArtistField.setStringValue(record.artist);
        artistField.setBytesValue(new BytesRef(record.artist));
        songField.setStringValue(record.song);
        weeksStoredField.setFloatValue(record.weeks);
        weeksAtNumberOneField.setFloatValue(record.weeks);
        writer.addDocument(doc);
        if (i % 10 == 0) {
            // Causes the creation of multiple segments for our test
            writer.commit();
        }
    }
    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
    artistDocValues = MultiDocValues.getSortedValues(reader, "artist");
    // All searches sort by song popularity 
    final Similarity base = searcher.getSimilarity(true);
    searcher.setSimilarity(new DocValueSimilarity(base, "weeksAtNumberOne"));
}

Also used : Similarity(org.apache.lucene.search.similarities.Similarity) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Document(org.apache.lucene.document.Document) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) StoredField(org.apache.lucene.document.StoredField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Field(org.apache.lucene.document.Field) StoredField(org.apache.lucene.document.StoredField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 44 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestBulkSchemaAPI method assertFieldSimilarity.

/**
   * whitebox checks the Similarity for the specified field according to {@link SolrCore#getLatestSchema}
   * 
   * Executes each of the specified Similarity-accepting validators.
   */
@SafeVarargs
private static <T extends Similarity> void assertFieldSimilarity(String fieldname, Class<T> expected, Consumer<T>... validators) {
    CoreContainer cc = jetty.getCoreContainer();
    try (SolrCore core = cc.getCore("collection1")) {
        SimilarityFactory simfac = core.getLatestSchema().getSimilarityFactory();
        assertNotNull(simfac);
        assertTrue("test only works with SchemaSimilarityFactory", simfac instanceof SchemaSimilarityFactory);
        Similarity mainSim = core.getLatestSchema().getSimilarity();
        assertNotNull(mainSim);
        // sanity check simfac vs sim in use - also verify infom called on simfac, otherwise exception
        assertEquals(mainSim, simfac.getSimilarity());
        assertTrue("test only works with PerFieldSimilarityWrapper, SchemaSimilarityFactory redefined?", mainSim instanceof PerFieldSimilarityWrapper);
        Similarity fieldSim = ((PerFieldSimilarityWrapper) mainSim).get(fieldname);
        assertEquals("wrong sim for field=" + fieldname, expected, fieldSim.getClass());
        Arrays.asList(validators).forEach(v -> v.accept((T) fieldSim));
    }
}

Also used : CoreContainer(org.apache.solr.core.CoreContainer) SweetSpotSimilarity(org.apache.lucene.misc.SweetSpotSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) DFISimilarity(org.apache.lucene.search.similarities.DFISimilarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) SolrCore(org.apache.solr.core.SolrCore) PerFieldSimilarityWrapper(org.apache.lucene.search.similarities.PerFieldSimilarityWrapper) SchemaSimilarityFactory(org.apache.solr.search.similarities.SchemaSimilarityFactory) SimilarityFactory(org.apache.solr.schema.SimilarityFactory) SchemaSimilarityFactory(org.apache.solr.search.similarities.SchemaSimilarityFactory)

Example 45 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class ChangedSchemaMergeTest method testSanityOfSchemaSimilarityFactoryInform.

public void testSanityOfSchemaSimilarityFactoryInform() {
    // sanity check that SchemaSimilarityFactory will throw an Exception if you
    // try to use it w/o inform(SolrCoreAware) otherwise assertSimilarity is useless
    SchemaSimilarityFactory broken = new SchemaSimilarityFactory();
    broken.init(new ModifiableSolrParams());
    // NO INFORM
    try {
        Similarity bogus = broken.getSimilarity();
        fail("SchemaSimilarityFactory should have thrown IllegalStateException b/c inform not used");
    } catch (IllegalStateException expected) {
        assertTrue("GOT: " + expected.getMessage(), expected.getMessage().contains("SolrCoreAware.inform"));
    }
}

Also used : Similarity(org.apache.lucene.search.similarities.Similarity) SchemaSimilarityFactory(org.apache.solr.search.similarities.SchemaSimilarityFactory) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams)

Aggregations

Similarity (org.apache.lucene.search.similarities.Similarity)49 ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity)16 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)15 Directory (org.apache.lucene.store.Directory)9 PerFieldSimilarityWrapper (org.apache.lucene.search.similarities.PerFieldSimilarityWrapper)8 SweetSpotSimilarity (org.apache.lucene.misc.SweetSpotSimilarity)7 IOException (java.io.IOException)6 IndexSearcher (org.apache.lucene.search.IndexSearcher)6 Document (org.apache.lucene.document.Document)5 Term (org.apache.lucene.index.Term)5 Collectors (java.util.stream.Collectors)4 IntStream (java.util.stream.IntStream)4 Field (org.apache.lucene.document.Field)4 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)4 NormValueSource (org.apache.lucene.queries.function.valuesource.NormValueSource)4 TermQuery (org.apache.lucene.search.TermQuery)4 BytesRef (org.apache.lucene.util.BytesRef)4 ScoredDocuments (io.anserini.rerank.ScoredDocuments)3 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 Store (org.apache.lucene.document.Field.Store)3