Search in sources :

Example 41 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class TestQueryRescorer method testNullScorerTermQuery.

// Test LUCENE-5682
public void testNullScorerTermQuery() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
    Document doc = new Document();
    doc.add(newStringField("id", "0", Field.Store.YES));
    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newStringField("id", "1", Field.Store.YES));
    // 1 extra token, but wizard and oz are close;
    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    w.close();
    // Do ordinary BooleanQuery:
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
    IndexSearcher searcher = getSearcher(r);
    searcher.setSimilarity(new ClassicSimilarity());
    TopDocs hits = searcher.search(bq.build(), 10);
    assertEquals(2, hits.totalHits);
    assertEquals("0", searcher.doc(hits.scoreDocs[0].doc).get("id"));
    assertEquals("1", searcher.doc(hits.scoreDocs[1].doc).get("id"));
    // Now, resort using TermQuery on term that does not exist.
    TermQuery tq = new TermQuery(new Term("field", "gold"));
    TopDocs hits2 = QueryRescorer.rescore(searcher, hits, tq, 2.0, 10);
    // Just testing that null scorer is handled.
    assertEquals(2, hits2.totalHits);
    r.close();
    dir.close();
}
Also used : SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) IndexReader(org.apache.lucene.index.IndexReader) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 42 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project lucene-solr by apache.

the class ClassicSimilarityFactory method getSimilarity.

@Override
public Similarity getSimilarity() {
    ClassicSimilarity sim = new ClassicSimilarity();
    sim.setDiscountOverlaps(discountOverlaps);
    return sim;
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity)

Example 43 with ClassicSimilarity

use of org.apache.lucene.search.similarities.ClassicSimilarity in project Anserini by castorini.

the class IdfPassageScorer method score.

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    //    EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();
    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));
    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();
            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }
    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();
    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();
        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                } else {
                    idf += 0.0;
                }
            } catch (Exception e) {
                continue;
            }
        }
        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent)) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) Term(org.apache.lucene.index.Term) QueryParser(org.apache.lucene.queryparser.classic.QueryParser)

Aggregations

ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity)43 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)14 Document (org.apache.lucene.document.Document)13 Term (org.apache.lucene.index.Term)12 Directory (org.apache.lucene.store.Directory)10 IndexReader (org.apache.lucene.index.IndexReader)9 Similarity (org.apache.lucene.search.similarities.Similarity)9 TermQuery (org.apache.lucene.search.TermQuery)7 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)7 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)6 ConstValueSource (org.apache.lucene.queries.function.valuesource.ConstValueSource)5 DocFreqValueSource (org.apache.lucene.queries.function.valuesource.DocFreqValueSource)4 DoubleConstValueSource (org.apache.lucene.queries.function.valuesource.DoubleConstValueSource)4 IDFValueSource (org.apache.lucene.queries.function.valuesource.IDFValueSource)4 JoinDocFreqValueSource (org.apache.lucene.queries.function.valuesource.JoinDocFreqValueSource)4 LiteralValueSource (org.apache.lucene.queries.function.valuesource.LiteralValueSource)4 MaxDocValueSource (org.apache.lucene.queries.function.valuesource.MaxDocValueSource)4 IndexSearcher (org.apache.lucene.search.IndexSearcher)4 Query (org.apache.lucene.search.Query)4 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)3