Search in sources :

Example 31 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestSweetSpotSimilarityFactory method testBaselineParameters.

/**
 * Baseline with parameters: checks the tf and length-norm curves of the
 * {@code text_baseline} SweetSpotSimilarity configuration.
 *
 * <p>Asserted properties:
 * <ul>
 *   <li>tf is the constant 1.5 for frequencies 1 through 6;</li>
 *   <li>tf stays strictly below ClassicSimilarity's tf for frequencies 6 through 1000;</li>
 *   <li>the norm is exactly 1.0 for lengths 3 through 5 and falls off
 *       symmetrically on both sides of that plateau (1 == 7 and 2 == 6).</li>
 * </ul>
 *
 * @throws Exception if the similarity cannot be obtained or a norm cannot be computed
 */
public void testBaselineParameters() throws Exception {
    SweetSpotSimilarity sim = getSimilarity("text_baseline", SweetSpotSimilarity.class);
    ClassicSimilarity d = new ClassicSimilarity();
    // constant up to 6
    for (int i = 1; i <= 6; i++) {
        assertEquals("tf i=" + i, 1.5F, sim.tf(i), 0.0F);
    }
    // less than the default sim from 6 upward (6 itself is included in this range)
    for (int i = 6; i <= 1000; i++) {
        assertTrue("tf: i=" + i + " : s=" + sim.tf(i) + " < d=" + d.tf(i), sim.tf(i) < d.tf(i));
    }
    // norms: plateau from 3-5, symmetric fall-off on either side of it
    assertEquals("norm 1 == 7", computeNorm(sim, 1), computeNorm(sim, 7), 0.0F);
    // compare lengths 2 and 6 (previously this line re-checked 1 and 7 by mistake)
    assertEquals("norm 2 == 6", computeNorm(sim, 2), computeNorm(sim, 6), 0.0F);
    assertEquals("norm 3", 1.00F, computeNorm(sim, 3), 0.0F);
    assertEquals("norm 4", 1.00F, computeNorm(sim, 4), 0.0F);
    assertEquals("norm 5", 1.00F, computeNorm(sim, 5), 0.0F);
    // beyond the plateau the norm must drop below 1.0 and keep decreasing
    assertTrue("norm 6 too high: " + computeNorm(sim, 6), computeNorm(sim, 6) < 1.0F);
    assertTrue("norm 7 higher then norm 6", computeNorm(sim, 7) < computeNorm(sim, 6));
    assertEquals("norm 20", 0.25F, computeNorm(sim, 20), 0.0F);
}
Also used : SweetSpotSimilarity(org.apache.lucene.misc.SweetSpotSimilarity) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity)

Example 32 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestBoolean2 method beforeClass.

/**
 * One-time set-up that builds the indexes and searchers used by this test class.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Pick {@code NUM_FILLER_DOCS} (0 or {@code BooleanScorer.SIZE}) and a random
 *       {@code PRE_FILLER_DOCS} bounded by half of it.</li>
 *   <li>Build the small index in {@code directory}: leading empty filler docs, then one
 *       document per entry of {@code docFields}, each followed by {@code NUM_FILLER_DOCS}
 *       empty docs. {@code searcher} reads it with {@link ClassicSimilarity}.</li>
 *   <li>Copy that index file-by-file into {@code singleSegmentDirectory}, force-merge it
 *       to one segment, and open {@code singleSegmentSearcher} with the same similarity.</li>
 *   <li>Build the big index in {@code dir2} by repeatedly adding a copy of itself
 *       (doubling {@code mulFactor} on every cycle), then append {@code NUM_EXTRA_DOCS}
 *       extra documents in {@code field2} and open {@code bigSearcher} on it.</li>
 * </ol>
 *
 * @throws Exception if any index cannot be written, copied or opened
 */
@BeforeClass
public static void beforeClass() throws Exception {
    // in some runs, test immediate adjacency of matches - in others, force a full bucket gap between docs
    NUM_FILLER_DOCS = random().nextBoolean() ? 0 : BooleanScorer.SIZE;
    PRE_FILLER_DOCS = TestUtil.nextInt(random(), 0, (NUM_FILLER_DOCS / 2));
    if (VERBOSE) {
        System.out.println("TEST: NUM_FILLER_DOCS=" + NUM_FILLER_DOCS + " PRE_FILLER_DOCS=" + PRE_FILLER_DOCS);
    }
    // use a file-system directory once the filler product exceeds 100000, otherwise the default test directory
    if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
        directory = newFSDirectory(createTempDir());
    } else {
        directory = newDirectory();
    }
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
    // we'll make a ton of docs, disable store/norms/vectors
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setOmitNorms(true);
    Document doc = new Document();
    // leading filler: the same empty document is added PRE_FILLER_DOCS times
    for (int filler = 0; filler < PRE_FILLER_DOCS; filler++) {
        writer.addDocument(doc);
    }
    for (int i = 0; i < docFields.length; i++) {
        // the real document: the current (so far empty) doc gets the field text, is indexed, ...
        doc.add(new Field(field, docFields[i], ft));
        writer.addDocument(doc);
        // ... and is then replaced by a fresh empty doc that serves as the filler after it
        doc = new Document();
        for (int filler = 0; filler < NUM_FILLER_DOCS; filler++) {
            writer.addDocument(doc);
        }
    }
    writer.close();
    littleReader = DirectoryReader.open(directory);
    searcher = newSearcher(littleReader);
    // this is intentionally using the baseline sim, because it compares against bigSearcher (which uses a random one)
    searcher.setSimilarity(new ClassicSimilarity());
    // make a copy of our index using a single segment
    if (NUM_FILLER_DOCS * PRE_FILLER_DOCS > 100000) {
        singleSegmentDirectory = newFSDirectory(createTempDir());
    } else {
        singleSegmentDirectory = newDirectory();
    }
    // TODO: this test does not need to be doing this crazy stuff. please improve it!
    for (String fileName : directory.listAll()) {
        // files whose name starts with "extra" are not copied
        if (fileName.startsWith("extra")) {
            continue;
        }
        singleSegmentDirectory.copyFrom(directory, fileName, fileName, IOContext.DEFAULT);
        singleSegmentDirectory.sync(Collections.singleton(fileName));
    }
    iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    // we need docID order to be preserved:
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    iwc.setMergePolicy(newLogMergePolicy());
    // open a writer on the copy only to merge it down to one segment; try-with-resources closes it
    try (IndexWriter w = new IndexWriter(singleSegmentDirectory, iwc)) {
        w.forceMerge(1, true);
    }
    singleSegmentReader = DirectoryReader.open(singleSegmentDirectory);
    singleSegmentSearcher = newSearcher(singleSegmentReader);
    // same similarity as the multi-segment searcher
    singleSegmentSearcher.setSimilarity(searcher.getSimilarity(true));
    // Make big index
    dir2 = copyOf(directory);
    // First multiply small test index:
    mulFactor = 1;
    int docCount = 0;
    if (VERBOSE) {
        System.out.println("\nTEST: now copy index...");
    }
    // each cycle adds a copy of dir2 to itself and doubles mulFactor;
    // the body always runs at least once, and stops once maxDoc reaches 3000 * NUM_FILLER_DOCS
    do {
        if (VERBOSE) {
            System.out.println("\nTEST: cycle...");
        }
        final Directory copy = copyOf(dir2);
        iwc = newIndexWriterConfig(new MockAnalyzer(random()));
        // randomized codecs are sometimes too costly for this test:
        iwc.setCodec(Codec.forName("Lucene70"));
        RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
        w.addIndexes(copy);
        copy.close();
        docCount = w.maxDoc();
        w.close();
        mulFactor *= 2;
    } while (docCount < 3000 * NUM_FILLER_DOCS);
    iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000));
    // randomized codecs are sometimes too costly for this test:
    iwc.setCodec(Codec.forName("Lucene70"));
    RandomIndexWriter w = new RandomIndexWriter(random(), dir2, iwc);
    // extra docs, first half: "field2" containing "xxx"
    doc = new Document();
    doc.add(new Field("field2", "xxx", ft));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++) {
        w.addDocument(doc);
    }
    // extra docs, second half: "field2" containing "big bad bug"
    doc = new Document();
    doc.add(new Field("field2", "big bad bug", ft));
    for (int i = 0; i < NUM_EXTRA_DOCS / 2; i++) {
        w.addDocument(doc);
    }
    // obtain the reader from the writer before closing it
    reader = w.getReader();
    bigSearcher = newSearcher(reader);
    w.close();
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType) Directory(org.apache.lucene.store.Directory) BeforeClass(org.junit.BeforeClass)

Example 33 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestComplexExplanations method setUp.

/**
 * Performs the inherited set-up, then installs a {@link ClassicSimilarity}
 * on {@code searcher}.
 *
 * @throws Exception if the inherited set-up fails
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    // TODO: switch to BM25?
    final ClassicSimilarity classicSim = new ClassicSimilarity();
    searcher.setSimilarity(classicSim);
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity)

Example 34 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestMinShouldMatch2 method beforeClass.

/**
 * One-time set-up: indexes at least 300 random documents into a new directory,
 * merges them into a single segment, and opens {@code searcher} on the only leaf
 * reader with {@link ClassicSimilarity}.
 *
 * <p>Every document gets terms from {@code alwaysTerms}; terms from
 * {@code commonTerms}, {@code mediumTerms} and {@code rareTerms} are added only when
 * a fresh {@code random().nextInt(100)} draw is below 90, 50 and 10 respectively.
 *
 * @throws Exception if indexing or opening the reader fails
 */
@BeforeClass
public static void beforeClass() throws Exception {
    dir = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
    final int totalDocs = atLeast(300);
    int added = 0;
    while (added < totalDocs) {
        Document document = new Document();
        addSome(document, alwaysTerms);
        // each optional term group is decided by its own draw, taken right before it is used
        if (90 > random().nextInt(100)) {
            addSome(document, commonTerms);
        }
        if (50 > random().nextInt(100)) {
            addSome(document, mediumTerms);
        }
        if (10 > random().nextInt(100)) {
            addSome(document, rareTerms);
        }
        indexWriter.addDocument(document);
        added++;
    }
    // collapse to one segment so that getOnlyLeafReader can be used below
    indexWriter.forceMerge(1);
    indexWriter.close();
    r = DirectoryReader.open(dir);
    reader = getOnlyLeafReader(r);
    searcher = new IndexSearcher(reader);
    final ClassicSimilarity classicSim = new ClassicSimilarity();
    searcher.setSimilarity(classicSim);
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BeforeClass(org.junit.BeforeClass)

Example 35 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestValueSources method testQuery.

/**
 * Exercises {@code QueryValueSource} while {@code searcher} temporarily uses
 * {@link ClassicSimilarity}; the previous similarity is restored in a
 * {@code finally} block.
 *
 * <p>Checks the hit values for a term query on {@code string:bar} with default 42,
 * compares its "exists" behavior against a tf-based wrapper, and then checks a
 * query expected to match every document and one expected to match none.
 *
 * @throws Exception if any of the assertion helpers fails with an error
 */
public void testQuery() throws Exception {
    final Similarity originalSim = searcher.getSimilarity(true);
    try {
        searcher.setSimilarity(new ClassicSimilarity());

        final TermQuery barQuery = new TermQuery(new Term("string", "bar"));
        final ValueSource barSource = new QueryValueSource(barQuery, 42F);
        final FunctionQuery barFunction = new FunctionQuery(barSource);
        assertHits(barFunction, new float[] { 42F, 1.4054651F });

        // valuesource should exist only for things matching the term query
        // sanity check via a quick & dirty wrapper around tf
        final ValueSource tfSource = new TFValueSource("bogus", "bogus", "string", new BytesRef("bar"));
        final ValueSource tfBasedExists = new MultiFloatFunction(new ValueSource[] { tfSource }) {

            @Override
            protected String name() {
                return "tf_based_exists";
            }

            @Override
            protected float func(int docId, FunctionValues[] values) throws IOException {
                return values[0].floatVal(docId);
            }

            @Override
            protected boolean exists(int docId, FunctionValues[] values) throws IOException {
                // a positive tf means the value exists for this doc
                return func(docId, values) > 0;
            }
        };
        assertExists(tfBasedExists, barSource);

        // Query matches all docs, func exists for all docs
        final ValueSource matchesAll = new QueryValueSource(new TermQuery(new Term("text", "test")), 0F);
        assertAllExist(matchesAll);

        // Query matches no docs, func exists for no docs
        final ValueSource matchesNone = new QueryValueSource(new TermQuery(new Term("bogus", "does not exist")), 0F);
        assertNoneExist(matchesNone);
    } finally {
        searcher.setSimilarity(originalSim);
    }
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) TermQuery(org.apache.lucene.search.TermQuery) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) MultiFloatFunction(org.apache.lucene.queries.function.valuesource.MultiFloatFunction) SumTotalTermFreqValueSource(org.apache.lucene.queries.function.valuesource.SumTotalTermFreqValueSource) DoubleConstValueSource(org.apache.lucene.queries.function.valuesource.DoubleConstValueSource) ConstValueSource(org.apache.lucene.queries.function.valuesource.ConstValueSource) QueryValueSource(org.apache.lucene.queries.function.valuesource.QueryValueSource) DocFreqValueSource(org.apache.lucene.queries.function.valuesource.DocFreqValueSource) NormValueSource(org.apache.lucene.queries.function.valuesource.NormValueSource) NumDocsValueSource(org.apache.lucene.queries.function.valuesource.NumDocsValueSource) MaxDocValueSource(org.apache.lucene.queries.function.valuesource.MaxDocValueSource) JoinDocFreqValueSource(org.apache.lucene.queries.function.valuesource.JoinDocFreqValueSource) LiteralValueSource(org.apache.lucene.queries.function.valuesource.LiteralValueSource) TotalTermFreqValueSource(org.apache.lucene.queries.function.valuesource.TotalTermFreqValueSource) IDFValueSource(org.apache.lucene.queries.function.valuesource.IDFValueSource) TermFreqValueSource(org.apache.lucene.queries.function.valuesource.TermFreqValueSource) TFValueSource(org.apache.lucene.queries.function.valuesource.TFValueSource) TFValueSource(org.apache.lucene.queries.function.valuesource.TFValueSource) Term(org.apache.lucene.index.Term) QueryValueSource(org.apache.lucene.queries.function.valuesource.QueryValueSource) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity): 43; RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 14; Document (org.apache.lucene.document.Document): 13; Term (org.apache.lucene.index.Term): 12; Directory (org.apache.lucene.store.Directory): 10; IndexReader (org.apache.lucene.index.IndexReader): 9; Similarity (org.apache.lucene.search.similarities.Similarity): 9; TermQuery (org.apache.lucene.search.TermQuery): 7; BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity): 7; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 6; ConstValueSource (org.apache.lucene.queries.function.valuesource.ConstValueSource): 5; DocFreqValueSource (org.apache.lucene.queries.function.valuesource.DocFreqValueSource): 4; DoubleConstValueSource (org.apache.lucene.queries.function.valuesource.DoubleConstValueSource): 4; IDFValueSource (org.apache.lucene.queries.function.valuesource.IDFValueSource): 4; JoinDocFreqValueSource (org.apache.lucene.queries.function.valuesource.JoinDocFreqValueSource): 4; LiteralValueSource (org.apache.lucene.queries.function.valuesource.LiteralValueSource): 4; MaxDocValueSource (org.apache.lucene.queries.function.valuesource.MaxDocValueSource): 4; IndexSearcher (org.apache.lucene.search.IndexSearcher): 4; Query (org.apache.lucene.search.Query): 4; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 3