
Example 16 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class SimpleSearcher method setBM25.

/**
 * Specifies use of BM25 as the scoring function.
 *
 * @param k1 k1 parameter
 * @param b b parameter
 */
public void setBM25(float k1, float b) {
    this.similarity = new BM25Similarity(k1, b);
    // Re-initialize the searcher so the new similarity takes effect.
    searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)
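
A minimal usage sketch follows; the index path, query, and the exact SimpleSearcher constructor and search signature are assumptions that may vary across Anserini versions.

// Hedged sketch: retune BM25 on an existing Anserini index, then run a query.
SimpleSearcher searcher = new SimpleSearcher("/path/to/lucene-index");  // hypothetical index path
searcher.setBM25(0.9f, 0.4f);  // recreates the underlying IndexSearcher with the new similarity
SimpleSearcher.Result[] hits = searcher.search("information retrieval", 10);
for (SimpleSearcher.Result hit : hits) {
    System.out.println(hit.docid + " " + hit.score);
}
searcher.close();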

Example 17 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class IndexReaderUtils method getBM25AnalyzedTermWeightWithParameters.

/**
 * Computes the BM25 weight of an analyzed term in a particular document.
 *
 * @param reader index reader
 * @param docid collection docid
 * @param term analyzed term
 * @param k1 k1 setting for BM25
 * @param b b setting for BM25
 * @return BM25 weight of the term in the specified document
 * @throws IOException if error encountered during query
 */
public static float getBM25AnalyzedTermWeightWithParameters(IndexReader reader, String docid, String term, float k1, float b) throws IOException {
    // We compute the BM25 score by issuing a single-term query with an additional filter clause that restricts
    // consideration to only the docid in question, and then returning the retrieval score.
    // 
    // This implementation is inefficient, but it has the advantage of using the existing Lucene similarity, which
    // means that we don't need to copy the scoring function and keep it in sync with respect to code updates.
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity(k1, b));
    Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
    Query termQuery = new TermQuery(new Term(IndexArgs.CONTENTS, term));
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filterQuery, BooleanClause.Occur.MUST);
    builder.add(termQuery, BooleanClause.Occur.MUST);
    Query finalQuery = builder.build();
    TopDocs rs = searcher.search(finalQuery, 1);
    // Zero results indicates that the term isn't found in the document.
    // Otherwise, subtract the constant 1.0 contributed by the filter clause to recover the BM25 term weight.
    return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) Term(org.apache.lucene.index.Term)
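
A usage sketch for this helper; the index path, docid, and term are hypothetical, and the term must already be analyzed (e.g., stemmed), since the method performs no analysis of its own.

// Hedged sketch: open an index and compute a BM25 term weight with custom k1/b.
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/lucene-index")));
float weight = IndexReaderUtils.getBM25AnalyzedTermWeightWithParameters(reader, "FBIS4-67701", "retriev", 0.9f, 0.4f);
System.out.println("BM25 weight: " + weight);
reader.close();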

Example 18 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class IndexCollection method run.

public Counters run() throws IOException {
    final long start = System.nanoTime();
    LOG.info("============ Indexing Collection ============");
    int numThreads = args.threads;
    IndexWriter writer = null;
    // Used for LocalIndexThread
    if (indexPath != null) {
        final Directory dir = FSDirectory.open(indexPath);
        final CJKAnalyzer chineseAnalyzer = new CJKAnalyzer();
        final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer();
        final BengaliAnalyzer bengaliAnalyzer = new BengaliAnalyzer();
        final DanishAnalyzer danishAnalyzer = new DanishAnalyzer();
        final DutchAnalyzer dutchAnalyzer = new DutchAnalyzer();
        final FinnishAnalyzer finnishAnalyzer = new FinnishAnalyzer();
        final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
        final GermanAnalyzer germanAnalyzer = new GermanAnalyzer();
        final HindiAnalyzer hindiAnalyzer = new HindiAnalyzer();
        final HungarianAnalyzer hungarianAnalyzer = new HungarianAnalyzer();
        final IndonesianAnalyzer indonesianAnalyzer = new IndonesianAnalyzer();
        final ItalianAnalyzer italianAnalyzer = new ItalianAnalyzer();
        final JapaneseAnalyzer japaneseAnalyzer = new JapaneseAnalyzer();
        final NorwegianAnalyzer norwegianAnalyzer = new NorwegianAnalyzer();
        final PortugueseAnalyzer portugueseAnalyzer = new PortugueseAnalyzer();
        final RussianAnalyzer russianAnalyzer = new RussianAnalyzer();
        final SpanishAnalyzer spanishAnalyzer = new SpanishAnalyzer();
        final SwedishAnalyzer swedishAnalyzer = new SwedishAnalyzer();
        final ThaiAnalyzer thaiAnalyzer = new ThaiAnalyzer();
        final TurkishAnalyzer turkishAnalyzer = new TurkishAnalyzer();
        final WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
        final DefaultEnglishAnalyzer analyzer = DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords);
        final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
        final IndexWriterConfig config;
        if (args.collectionClass.equals("TweetCollection")) {
            config = new IndexWriterConfig(tweetAnalyzer);
        } else if (args.language.equals("ar")) {
            config = new IndexWriterConfig(arabicAnalyzer);
        } else if (args.language.equals("bn")) {
            config = new IndexWriterConfig(bengaliAnalyzer);
        } else if (args.language.equals("da")) {
            config = new IndexWriterConfig(danishAnalyzer);
        } else if (args.language.equals("de")) {
            config = new IndexWriterConfig(germanAnalyzer);
        } else if (args.language.equals("es")) {
            config = new IndexWriterConfig(spanishAnalyzer);
        } else if (args.language.equals("fi")) {
            config = new IndexWriterConfig(finnishAnalyzer);
        } else if (args.language.equals("fr")) {
            config = new IndexWriterConfig(frenchAnalyzer);
        } else if (args.language.equals("hi")) {
            config = new IndexWriterConfig(hindiAnalyzer);
        } else if (args.language.equals("hu")) {
            config = new IndexWriterConfig(hungarianAnalyzer);
        } else if (args.language.equals("id")) {
            config = new IndexWriterConfig(indonesianAnalyzer);
        } else if (args.language.equals("it")) {
            config = new IndexWriterConfig(italianAnalyzer);
        } else if (args.language.equals("ja")) {
            config = new IndexWriterConfig(japaneseAnalyzer);
        } else if (args.language.equals("nl")) {
            config = new IndexWriterConfig(dutchAnalyzer);
        } else if (args.language.equals("no")) {
            config = new IndexWriterConfig(norwegianAnalyzer);
        } else if (args.language.equals("pt")) {
            config = new IndexWriterConfig(portugueseAnalyzer);
        } else if (args.language.equals("ru")) {
            config = new IndexWriterConfig(russianAnalyzer);
        } else if (args.language.equals("sv")) {
            config = new IndexWriterConfig(swedishAnalyzer);
        } else if (args.language.equals("th")) {
            config = new IndexWriterConfig(thaiAnalyzer);
        } else if (args.language.equals("tr")) {
            config = new IndexWriterConfig(turkishAnalyzer);
        } else if (args.language.equals("zh") || args.language.equals("ko")) {
            config = new IndexWriterConfig(chineseAnalyzer);
        } else if (args.language.equals("sw") || args.language.equals("te")) {
            // For Mr.TyDi: sw and te do not have custom Lucene analyzers, so just use whitespace analyzer.
            config = new IndexWriterConfig(whitespaceAnalyzer);
        } else if (args.pretokenized) {
            config = new IndexWriterConfig(whitespaceAnalyzer);
        } else {
            config = new IndexWriterConfig(analyzer);
        }
        if (args.bm25Accurate) {
            // Necessary during indexing because the norm used in BM25 is already determined at index time.
            config.setSimilarity(new AccurateBM25Similarity());
        } else if (args.impact) {
            config.setSimilarity(new ImpactSimilarity());
        } else {
            config.setSimilarity(new BM25Similarity());
        }
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setRAMBufferSizeMB(args.memorybufferSize);
        config.setUseCompoundFile(false);
        config.setMergeScheduler(new ConcurrentMergeScheduler());
        writer = new IndexWriter(dir, config);
    }
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    LOG.info("Thread pool with " + numThreads + " threads initialized.");
    LOG.info("Initializing collection in " + collectionPath.toString());
    List<?> segmentPaths = collection.getSegmentPaths();
    // Apply sharding when requested.
    if (args.shardCount > 1) {
        segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent);
    }
    final int segmentCnt = segmentPaths.size();
    LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files")));
    LOG.info("Starting to index...");
    for (int i = 0; i < segmentCnt; i++) {
        if (args.solr) {
            executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i)));
        } else if (args.es) {
            executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i)));
        } else {
            executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i)));
        }
    }
    executor.shutdown();
    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            if (segmentCnt == 1) {
                LOG.info(String.format("%,d documents indexed", counters.indexed.get()));
            } else {
                LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get()));
            }
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }
    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  " + executor.getCompletedTaskCount());
    }
    long numIndexed;
    if (args.solr || args.es) {
        numIndexed = counters.indexed.get();
    } else {
        numIndexed = writer.getDocStats().maxDoc;
    }
    // Do a final commit
    if (args.solr) {
        try {
            SolrClient client = solrPool.borrowObject();
            client.commit(args.solrIndex);
            // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit
            solrPool.returnObject(client);
            solrPool.close();
        } catch (Exception e) {
            LOG.error("Exception during final Solr commit: ", e);
        }
    }
    if (args.es) {
        esPool.close();
    }
    try {
        if (writer != null) {
            writer.commit();
            if (args.optimize) {
                writer.forceMerge(1);
            }
        }
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }
    if (numIndexed != counters.indexed.get()) {
        LOG.warn("Unexpected difference between number of indexed documents and index maxDoc.");
    }
    LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed));
    LOG.info("============ Final Counter Values ============");
    LOG.info(String.format("indexed:     %,12d", counters.indexed.get()));
    LOG.info(String.format("unindexable: %,12d", counters.unindexable.get()));
    LOG.info(String.format("empty:       %,12d", counters.empty.get()));
    LOG.info(String.format("skipped:     %,12d", counters.skipped.get()));
    LOG.info(String.format("errors:      %,12d", counters.errors.get()));
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
    return counters;
}
Also used : IndonesianAnalyzer(org.apache.lucene.analysis.id.IndonesianAnalyzer) TweetAnalyzer(io.anserini.analysis.TweetAnalyzer) JapaneseAnalyzer(org.apache.lucene.analysis.ja.JapaneseAnalyzer) DefaultEnglishAnalyzer(io.anserini.analysis.DefaultEnglishAnalyzer) ConcurrentMergeScheduler(org.apache.lucene.index.ConcurrentMergeScheduler) DutchAnalyzer(org.apache.lucene.analysis.nl.DutchAnalyzer) SpanishAnalyzer(org.apache.lucene.analysis.es.SpanishAnalyzer) AccurateBM25Similarity(io.anserini.search.similarity.AccurateBM25Similarity) CloudSolrClient(org.apache.solr.client.solrj.impl.CloudSolrClient) SolrClient(org.apache.solr.client.solrj.SolrClient) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) RussianAnalyzer(org.apache.lucene.analysis.ru.RussianAnalyzer) NorwegianAnalyzer(org.apache.lucene.analysis.no.NorwegianAnalyzer) GermanAnalyzer(org.apache.lucene.analysis.de.GermanAnalyzer) ItalianAnalyzer(org.apache.lucene.analysis.it.ItalianAnalyzer) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) BengaliAnalyzer(org.apache.lucene.analysis.bn.BengaliAnalyzer) ThaiAnalyzer(org.apache.lucene.analysis.th.ThaiAnalyzer) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Path(java.nio.file.Path) ImpactSimilarity(io.anserini.search.similarity.ImpactSimilarity) CJKAnalyzer(org.apache.lucene.analysis.cjk.CJKAnalyzer) IOException(java.io.IOException) HungarianAnalyzer(org.apache.lucene.analysis.hu.HungarianAnalyzer) InvalidDocumentException(io.anserini.index.generator.InvalidDocumentException) CmdLineException(org.kohsuke.args4j.CmdLineException) SkippedDocumentException(io.anserini.index.generator.SkippedDocumentException) IOException(java.io.IOException) EmptyDocumentException(io.anserini.index.generator.EmptyDocumentException) ArabicAnalyzer(org.apache.lucene.analysis.ar.ArabicAnalyzer) SwedishAnalyzer(org.apache.lucene.analysis.sv.SwedishAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) DanishAnalyzer(org.apache.lucene.analysis.da.DanishAnalyzer) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) AccurateBM25Similarity(io.anserini.search.similarity.AccurateBM25Similarity) FinnishAnalyzer(org.apache.lucene.analysis.fi.FinnishAnalyzer) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) PortugueseAnalyzer(org.apache.lucene.analysis.pt.PortugueseAnalyzer) TurkishAnalyzer(org.apache.lucene.analysis.tr.TurkishAnalyzer) HindiAnalyzer(org.apache.lucene.analysis.hi.HindiAnalyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
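
Stripped of per-language analyzer selection and threading, the core index-time setup above reduces to a few lines. A minimal sketch, assuming a hypothetical index path, buffer size, and English analyzer arguments:

// Hedged sketch of the essential IndexWriter configuration. The similarity is set at index
// time because it controls how document-length norms are encoded in the index.
Directory dir = FSDirectory.open(Paths.get("/path/to/lucene-index"));
IndexWriterConfig config = new IndexWriterConfig(DefaultEnglishAnalyzer.fromArguments("porter", false, null));
config.setSimilarity(new BM25Similarity());
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(2048);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());
IndexWriter writer = new IndexWriter(dir, config);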

Example 19 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project Anserini by castorini.

the class BasicIndexOperationsTest method testIterateThroughDocumentVectorComputeBM25.

// This test case iterates through all documents in the index and prints out the document vector:
// For each term, we print out the term frequency and the BM25 weight.
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());
    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("id").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);
        Terms terms = reader.getTermVector(i, "contents");
        TermsEnum te = terms.iterator();
        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("contents", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();
            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            // the docid
            Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term("id", docid)));
            // the term
            Query termQuery = new TermQuery(new Term("contents", term));
            // must have both
            BooleanQuery.Builder builder = new BooleanQuery.Builder();
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            // issue the query
            TopDocs rs = searcher.search(finalQuery, 1);
            // The BM25 weight is the retrieval score minus the 1.0 contributed by the constant-score filter clause.
            System.out.println(term + " " + tf + " " + (rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score - 1));
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) Terms(org.apache.lucene.index.Terms) MultiTerms(org.apache.lucene.index.MultiTerms) Term(org.apache.lucene.index.Term) TermsEnum(org.apache.lucene.index.TermsEnum) TopDocs(org.apache.lucene.search.TopDocs) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)
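
The subtraction of 1 works because the ConstantScoreQuery filter clause contributes exactly 1.0 to the BooleanQuery sum. A small sanity-check sketch, reusing the searcher above with a hypothetical docid:

// Hedged sketch: a matching ConstantScoreQuery alone scores exactly 1.0, which is the amount
// subtracted from the combined score to recover the pure BM25 term weight.
Query filterOnly = new ConstantScoreQuery(new TermQuery(new Term("id", "doc1")));  // hypothetical docid
TopDocs hits = searcher.search(filterOnly, 1);
if (hits.scoreDocs.length > 0) {
    assert hits.scoreDocs[0].score == 1.0f;
}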

Example 20 with BM25Similarity

use of org.apache.lucene.search.similarities.BM25Similarity in project elasticsearch by elastic.

the class SimilarityTests method testResolveSimilaritiesFromMapping_bm25.

public void testResolveSimilaritiesFromMapping_bm25() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties").startObject("field1").field("type", "text").field("similarity", "my_similarity").endObject().endObject().endObject().endObject().string();
    Settings indexSettings = Settings.builder().put("index.similarity.my_similarity.type", "BM25").put("index.similarity.my_similarity.k1", 2.0f).put("index.similarity.my_similarity.b", 0.5f).put("index.similarity.my_similarity.discount_overlaps", false).build();
    IndexService indexService = createIndex("foo", indexSettings);
    DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));
    assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(BM25SimilarityProvider.class));
    BM25Similarity similarity = (BM25Similarity) documentMapper.mappers().getMapper("field1").fieldType().similarity().get();
    assertThat(similarity.getK1(), equalTo(2.0f));
    assertThat(similarity.getB(), equalTo(0.5f));
    assertThat(similarity.getDiscountOverlaps(), equalTo(false));
}
Also used : IndexService(org.elasticsearch.index.IndexService) DocumentMapper(org.elasticsearch.index.mapper.DocumentMapper) CompressedXContent(org.elasticsearch.common.compress.CompressedXContent) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Settings(org.elasticsearch.common.settings.Settings)
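
For reference, a sketch of the Lucene-level similarity those index settings resolve to; in the older Lucene API used by this Elasticsearch branch, discount_overlaps is applied through a setter, whereas later Lucene versions move it to a constructor argument.

// Hedged sketch: the BM25Similarity equivalent of the my_similarity settings above.
BM25Similarity similarity = new BM25Similarity(2.0f, 0.5f);  // k1 = 2.0, b = 0.5
similarity.setDiscountOverlaps(false);  // index.similarity.my_similarity.discount_overlaps = false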

Aggregations

BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity): 29
Directory (org.apache.lucene.store.Directory): 12
IndexSearcher (org.apache.lucene.search.IndexSearcher): 11
IndexReader (org.apache.lucene.index.IndexReader): 10
Similarity (org.apache.lucene.search.similarities.Similarity): 9
FSDirectory (org.apache.lucene.store.FSDirectory): 9
Query (org.apache.lucene.search.Query): 8
TopDocs (org.apache.lucene.search.TopDocs): 8
TermQuery (org.apache.lucene.search.TermQuery): 7
ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity): 7
Test (org.junit.Test): 7
Term (org.apache.lucene.index.Term): 6
RerankerCascade (io.anserini.rerank.RerankerCascade): 5
BooleanQuery (org.apache.lucene.search.BooleanQuery): 5
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4
FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors): 3
IdentityReranker (io.anserini.rerank.IdentityReranker): 3
ScoredDocuments (io.anserini.rerank.ScoredDocuments): 3
Qrels (io.anserini.util.Qrels): 3
PrintStream (java.io.PrintStream): 3