Search in sources :

Example 21 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project elasticsearch by elastic.

the class BlendedTermQueryTests method setSimilarity.

/**
 * Installs a randomly chosen similarity on the given searcher.
 *
 * <p>A single {@code random().nextBoolean()} draw decides between a default
 * {@link BM25Similarity} and a {@link ClassicSimilarity}.
 *
 * @param searcher the searcher to configure
 * @return the same {@code searcher} instance, after its similarity has been set
 */
public IndexSearcher setSimilarity(IndexSearcher searcher) {
    final Similarity chosen;
    if (random().nextBoolean()) {
        chosen = new BM25Similarity();
    } else {
        chosen = new ClassicSimilarity();
    }
    searcher.setSimilarity(chosen);
    return searcher;
}
Also used : ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)

Example 22 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project lucene-solr by apache.

the class TestMemoryIndex method testFreezeAPI.

/**
 * Walks a {@link MemoryIndex} through its freeze/reset lifecycle.
 *
 * <p>The test expects that fields can be added between searches, that after
 * {@code freeze()} both {@code addField} and {@code setSimilarity} throw a
 * {@link RuntimeException} whose message contains "frozen", and that after
 * {@code reset()} the index accepts new fields and a new similarity again.
 */
@Test
public void testFreezeAPI() {
    final MemoryIndex index = new MemoryIndex();
    // the same f1:some query is searched three times below, so build it once
    final TermQuery someInF1 = new TermQuery(new Term("f1", "some"));

    index.addField("f1", "some text", analyzer);
    assertThat(index.search(new MatchAllDocsQuery()), not(is(0.0f)));
    assertThat(index.search(someInF1), not(is(0.0f)));

    // a further field may still be added once a search has run
    index.addField("f2", "some more text", analyzer);
    assertThat(index.search(new TermQuery(new Term("f2", "some"))), not(is(0.0f)));

    // freeze!
    index.freeze();

    RuntimeException addFailure = expectThrows(RuntimeException.class,
        () -> index.addField("f3", "and yet more", analyzer));
    assertThat(addFailure.getMessage(), containsString("frozen"));

    RuntimeException similarityFailure = expectThrows(RuntimeException.class,
        () -> index.setSimilarity(new BM25Similarity(1, 1)));
    assertThat(similarityFailure.getMessage(), containsString("frozen"));

    // searching the frozen index still scores the original content
    assertThat(index.search(someInF1), not(is(0.0f)));

    // after reset() only the newly added content should match
    index.reset();
    index.addField("f1", "wibble", analyzer);
    assertThat(index.search(someInF1), is(0.0f));
    assertThat(index.search(new TermQuery(new Term("f1", "wibble"))), not(is(0.0f)));

    // check we can set the Similarity again
    index.setSimilarity(new ClassicSimilarity());
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Term(org.apache.lucene.index.Term) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Test(org.junit.Test)

Example 23 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class DumpTweetsLtrData method main.

/**
 * Command-line entry point: runs every topic against the index and feeds the results
 * through a reranker cascade whose last stage is a {@code TweetsLtrDataGenerator}
 * constructed with the output stream.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Parse {@code argv} into {@code LtrArgs}; on a parse error print usage and return.</li>
 *   <li>Open the index and configure the searcher with either query-likelihood
 *       ({@code args.ql}) or BM25 ({@code args.bm25}) scoring; exit with status -1 if neither
 *       is selected.</li>
 *   <li>Load qrels and (optionally) feature extractors, open the output stream and build the
 *       reranker cascade.</li>
 *   <li>For each topic, run a bag-of-words query filtered to tweet ids up to the topic's
 *       query tweet time, and pass the hits through the cascade.</li>
 * </ol>
 *
 * <p>The directory, index reader and output stream are managed with try-with-resources so
 * they are released even when a query or reranker throws.
 *
 * @param argv command-line arguments, parsed by args4j into {@code LtrArgs}
 * @throws Exception if the index, qrels, extractors, topics or output cannot be read/written,
 *     or if searching/reranking fails
 */
public static void main(String[] argv) throws Exception {
    long curTime = System.nanoTime();
    LtrArgs args = new LtrArgs();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: DumpTweetsLtrData" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + args.index);
    try (Directory dir = FSDirectory.open(Paths.get(args.index));
         IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        if (args.ql) {
            LOG.info("Using QL scoring model");
            searcher.setSimilarity(new LMDirichletSimilarity(args.mu));
        } else if (args.bm25) {
            LOG.info("Using BM25 scoring model");
            searcher.setSimilarity(new BM25Similarity(args.k1, args.b));
        } else {
            LOG.error("Error: Must specify scoring model!");
            System.exit(-1);
        }
        Qrels qrels = new Qrels(args.qrels);
        FeatureExtractors extractors = null;
        if (args.extractors != null) {
            extractors = FeatureExtractors.loadExtractor(args.extractors);
        }
        try (PrintStream out = new PrintStream(new FileOutputStream(new File(args.output)))) {
            RerankerCascade cascade = new RerankerCascade();
            cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
            cascade.add(new TweetsLtrDataGenerator(out, qrels, extractors));
            MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(args.topics));
            LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
            long totalTime = 0;
            int cnt = 0;
            for (MicroblogTopic topic : topics) {
                long curQueryTime = System.nanoTime();
                // restrict candidates to tweets whose id is at most the topic's query tweet time
                Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
                Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
                BooleanQuery.Builder builder = new BooleanQuery.Builder();
                builder.add(filter, BooleanClause.Occur.FILTER);
                builder.add(query, BooleanClause.Occur.MUST);
                Query q = builder.build();
                TopDocs rs = searcher.search(q, args.hits);
                List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
                RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
                cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
                long qtime = (System.nanoTime() - curQueryTime) / 1000000;
                LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
                totalTime += qtime;
                cnt++;
            }
            LOG.info("All queries completed!");
            LOG.info("Total elapsed time = " + totalTime + "ms");
            // an empty topic set would otherwise divide by zero
            if (cnt > 0) {
                LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
            } else {
                LOG.info("No topics were processed; average query latency not available");
            }
        }
    }
}
Also used : RemoveRetweetsTemporalTiebreakReranker(io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) MicroblogTopicSet(io.anserini.search.MicroblogTopicSet) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Qrels(io.anserini.util.Qrels) PrintStream(java.io.PrintStream) LongPoint(org.apache.lucene.document.LongPoint) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) MicroblogTopic(io.anserini.search.MicroblogTopic) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) File(java.io.File) RerankerContext(io.anserini.rerank.RerankerContext)

Example 24 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class RetrieveSentences method search.

/**
 * Runs every topic as a bag-of-words query against {@code FIELD_BODY} using BM25
 * (k1 = 0.9, b = 0.4) and collects the score of each hit keyed by its {@code FIELD_ID} value.
 *
 * <p>Results are accumulated in a single insertion-ordered map across all topics; if the same
 * id is hit more than once, the score stored last replaces the earlier one.
 *
 * @param topics  topic id to query string; only the query strings are used, in map order
 * @param numHits maximum number of hits to retrieve per topic
 * @return map from document id to score
 * @throws IOException    if searching the index fails
 * @throws ParseException kept in the signature for existing callers
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // using BM25 scoring model
    searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f));
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    // the analyzer is only needed while building queries; close it when done
    try (EnglishAnalyzer ea = new EnglishAnalyzer()) {
        for (String queryString : topics.values()) {
            Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
            TopDocs rs = searcher.search(query, numHits);
            ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
            for (int i = 0; i < docs.documents.length; i++) {
                scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
            }
        }
    }
    return scoredDocs;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser)

Example 25 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class SdmQueryTest method buildTestIndex.

// A very simple example of how to build an index.
// Writes one stored, tokenized document (with positions indexed) into a freshly created
// index under tempDir1, configured with BM25 similarity, and merges it down to one segment.
// The directory and writer are closed via try-with-resources so neither leaks on failure.
private void buildTestIndex() throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setSimilarity(new BM25Similarity());

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);

    Document doc1 = new Document();
    doc1.add(new Field(field, "john fox information river chicken bush frank retrieval world", textOptions));

    try (Directory dir = FSDirectory.open(tempDir1);
         IndexWriter writer = new IndexWriter(dir, config)) {
        writer.addDocument(doc1);
        writer.commit();
        writer.forceMerge(1);
    }
}
Also used : Field(org.apache.lucene.document.Field) IndexWriter(org.apache.lucene.index.IndexWriter) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Document(org.apache.lucene.document.Document) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) FieldType(org.apache.lucene.document.FieldType)

Aggregations

BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity): 29, Directory (org.apache.lucene.store.Directory): 12, IndexSearcher (org.apache.lucene.search.IndexSearcher): 11, IndexReader (org.apache.lucene.index.IndexReader): 10, Similarity (org.apache.lucene.search.similarities.Similarity): 9, FSDirectory (org.apache.lucene.store.FSDirectory): 9, Query (org.apache.lucene.search.Query): 8, TopDocs (org.apache.lucene.search.TopDocs): 8, TermQuery (org.apache.lucene.search.TermQuery): 7, ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity): 7, Test (org.junit.Test): 7, Term (org.apache.lucene.index.Term): 6, RerankerCascade (io.anserini.rerank.RerankerCascade): 5, BooleanQuery (org.apache.lucene.search.BooleanQuery): 5, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4, FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors): 3, IdentityReranker (io.anserini.rerank.IdentityReranker): 3, ScoredDocuments (io.anserini.rerank.ScoredDocuments): 3, Qrels (io.anserini.util.Qrels): 3, PrintStream (java.io.PrintStream): 3