Search in sources :

Example 21 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.

the class PyseriniEntryPoint method search.

public Map<String, Float> search(String query, int numHits) throws IOException, ParseException {
    // for now, using BM25 similarity  - not branching on args.bm25 or args.ql
    float k1 = 0.9f;
    float b = 0.4f;
    Similarity similarity = new BM25Similarity(k1, b);
    // for now, creating Topics map and appending query and setting id=1
    SortedMap<Integer, String> topics = new TreeMap<>();
    int id = 1;
    topics.put(id, query);
    // for now, using IdentityReranker  - not branching on args.rm3
    RerankerCascade cascade = new RerankerCascade();
    cascade.add(new IdentityReranker());
    Map<String, Float> scoredDocs = search(topics, similarity, numHits, cascade, false, false);
    return scoredDocs;
}
Also used : RerankerCascade(io.anserini.rerank.RerankerCascade) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IdentityReranker(io.anserini.rerank.IdentityReranker) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)

Example 22 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.

the class LookupTopic method search.

/**
   * Prints query results to the standard output stream.
   *
   * @param queryName the entity name to search
   * @throws Exception on error
   */
public void search(String queryName) throws Exception {
    LOG.info("Querying started...");
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    int numHits = 20;
    // find exact title
    QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
    Query titleQuery = titleParser.parse(queryName);
    TopDocs rs = searcher.search(titleQuery, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact WIKI_TITLE found! Ending search.");
        return;
    } else {
        System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
    }
    System.out.println();
    // find exact label
    QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
    Query labelQuery = labelParser.parse(queryName);
    rs = searcher.search(labelQuery, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact W3_LABEL found! Ending search.");
        return;
    } else {
        System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
    }
    System.out.println();
    float k1 = 1.5f;
    float b = 0.75f;
    Similarity similarity = new BM25Similarity(k1, b);
    searcher.setSimilarity(similarity);
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    rs = searcher.search(query, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    LOG.info("Querying completed.");
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ScoredDocuments(io.anserini.rerank.ScoredDocuments) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)

Example 23 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.

the class SearchWebCollection method main.

public static void main(String[] args) throws Exception {
    SearchArgs searchArgs = new SearchArgs();
    CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + searchArgs.index);
    Directory dir;
    if (searchArgs.inmem) {
        LOG.info("Using MMapDirectory with preload");
        dir = new MMapDirectory(Paths.get(searchArgs.index));
        ((MMapDirectory) dir).setPreload(true);
    } else {
        LOG.info("Using default FSDirectory");
        dir = FSDirectory.open(Paths.get(searchArgs.index));
    }
    Similarity similarity = null;
    if (searchArgs.ql) {
        LOG.info("Using QL scoring model");
        similarity = new LMDirichletSimilarity(searchArgs.mu);
    } else if (searchArgs.bm25) {
        LOG.info("Using BM25 scoring model");
        similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
    } else {
        LOG.error("Error: Must specify scoring model!");
        System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    boolean useQueryParser = false;
    if (searchArgs.rm3) {
        cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
        useQueryParser = true;
    } else {
        cascade.add(new IdentityReranker());
    }
    FeatureExtractors extractors = null;
    if (searchArgs.extractors != null) {
        extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
        PrintStream out = new PrintStream(searchArgs.featureFile);
        Qrels qrels = new Qrels(searchArgs.qrels);
        cascade.add(new WebCollectionLtrDataGenerator(out, qrels, extractors));
    }
    Path topicsFile = Paths.get(searchArgs.topics);
    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
        throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }
    TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
    SortedMap<Integer, String> topics = tr.read();
    final long start = System.nanoTime();
    SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
    searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
    searcher.close();
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
Also used : LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) IdentityReranker(io.anserini.rerank.IdentityReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) TopicReader(io.anserini.search.query.TopicReader) Rm3Reranker(io.anserini.rerank.rm3.Rm3Reranker) WebCollectionLtrDataGenerator(io.anserini.ltr.WebCollectionLtrDataGenerator) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) PrintStream(java.io.PrintStream) Qrels(io.anserini.util.Qrels) CmdLineParser(org.kohsuke.args4j.CmdLineParser) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) MMapDirectory(org.apache.lucene.store.MMapDirectory) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 24 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project elasticsearch by elastic.

the class AllTermQuery method createWeight.

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    if (needsScores == false) {
        return new TermQuery(term).createWeight(searcher, needsScores);
    }
    final TermContext termStates = TermContext.build(searcher.getTopReaderContext(), term);
    final CollectionStatistics collectionStats = searcher.collectionStatistics(term.field());
    final TermStatistics termStats = searcher.termStatistics(term, termStates);
    final Similarity similarity = searcher.getSimilarity(needsScores);
    final SimWeight stats = similarity.computeWeight(collectionStats, termStats);
    return new Weight(this) {

        @Override
        public float getValueForNormalization() throws IOException {
            return stats.getValueForNormalization();
        }

        @Override
        public void normalize(float norm, float topLevelBoost) {
            stats.normalize(norm, topLevelBoost);
        }

        @Override
        public void extractTerms(Set<Term> terms) {
            terms.add(term);
        }

        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            AllTermScorer scorer = scorer(context);
            if (scorer != null) {
                int newDoc = scorer.iterator().advance(doc);
                if (newDoc == doc) {
                    float score = scorer.score();
                    float freq = scorer.freq();
                    SimScorer docScorer = similarity.simScorer(stats, context);
                    Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
                    Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
                    Explanation payloadBoostExplanation = Explanation.match(scorer.payloadBoost(), "payloadBoost=" + scorer.payloadBoost());
                    return Explanation.match(score, "weight(" + getQuery() + " in " + doc + ") [" + similarity.getClass().getSimpleName() + "], product of:", termScoreExplanation, payloadBoostExplanation);
                }
            }
            return Explanation.noMatch("no matching term");
        }

        @Override
        public AllTermScorer scorer(LeafReaderContext context) throws IOException {
            final Terms terms = context.reader().terms(term.field());
            if (terms == null) {
                return null;
            }
            final TermsEnum termsEnum = terms.iterator();
            if (termsEnum == null) {
                return null;
            }
            final TermState state = termStates.get(context.ord);
            if (state == null) {
                // Term does not exist in this segment
                return null;
            }
            termsEnum.seekExact(term.bytes(), state);
            PostingsEnum docs = termsEnum.postings(null, PostingsEnum.PAYLOADS);
            assert docs != null;
            return new AllTermScorer(this, docs, similarity.simScorer(stats, context));
        }
    };
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Set(java.util.Set) Similarity(org.apache.lucene.search.similarities.Similarity) Explanation(org.apache.lucene.search.Explanation) SimWeight(org.apache.lucene.search.similarities.Similarity.SimWeight) Terms(org.apache.lucene.index.Terms) SimScorer(org.apache.lucene.search.similarities.Similarity.SimScorer) TermStatistics(org.apache.lucene.search.TermStatistics) TermContext(org.apache.lucene.index.TermContext) Weight(org.apache.lucene.search.Weight) SimWeight(org.apache.lucene.search.similarities.Similarity.SimWeight) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) TermState(org.apache.lucene.index.TermState) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Example 25 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project elasticsearch by elastic.

the class IndexModuleTests method testAddSimilarity.

public void testAddSimilarity() throws IOException {
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).put("index.similarity.my_similarity.type", "test_similarity").put("index.similarity.my_similarity.key", "there is a key").put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
    module.addSimilarity("test_similarity", (string, providerSettings, indexLevelSettings) -> new SimilarityProvider() {

        @Override
        public String name() {
            return string;
        }

        @Override
        public Similarity get() {
            return new TestSimilarity(providerSettings.get("key"));
        }
    });
    IndexService indexService = newIndexService(module);
    SimilarityService similarityService = indexService.similarityService();
    assertNotNull(similarityService.getSimilarity("my_similarity"));
    assertTrue(similarityService.getSimilarity("my_similarity").get() instanceof TestSimilarity);
    assertEquals("my_similarity", similarityService.getSimilarity("my_similarity").name());
    assertEquals("there is a key", ((TestSimilarity) similarityService.getSimilarity("my_similarity").get()).key);
    indexService.close("simon says", false);
}
Also used : AnalysisRegistry(org.elasticsearch.index.analysis.AnalysisRegistry) SimilarityProvider(org.elasticsearch.index.similarity.SimilarityProvider) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Similarity(org.apache.lucene.search.similarities.Similarity) SimilarityService(org.elasticsearch.index.similarity.SimilarityService) Settings(org.elasticsearch.common.settings.Settings) ScriptSettings(org.elasticsearch.script.ScriptSettings)

Aggregations

Similarity (org.apache.lucene.search.similarities.Similarity)48 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)15 ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity)15 Directory (org.apache.lucene.store.Directory)9 PerFieldSimilarityWrapper (org.apache.lucene.search.similarities.PerFieldSimilarityWrapper)8 SweetSpotSimilarity (org.apache.lucene.misc.SweetSpotSimilarity)7 IOException (java.io.IOException)6 Document (org.apache.lucene.document.Document)5 Term (org.apache.lucene.index.Term)5 IndexSearcher (org.apache.lucene.search.IndexSearcher)5 Collectors (java.util.stream.Collectors)4 IntStream (java.util.stream.IntStream)4 Field (org.apache.lucene.document.Field)4 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)4 NormValueSource (org.apache.lucene.queries.function.valuesource.NormValueSource)4 BytesRef (org.apache.lucene.util.BytesRef)4 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 Store (org.apache.lucene.document.Field.Store)3 IndexWriter (org.apache.lucene.index.IndexWriter)3 ConstValueSource (org.apache.lucene.queries.function.valuesource.ConstValueSource)3