Search in sources:

Example 11 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SearchCollection method searchTweets.

public <K> ScoredDocuments searchTweets(IndexSearcher searcher, K qid, String queryString, long t, RerankerCascade cascade, ScoredDocuments queryQrels, boolean hasRelDocs) throws IOException {
    Query keywordQuery;
    if (args.sdm) {
        keywordQuery = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    } else {
        try {
            QueryGenerator generator = (QueryGenerator) Class.forName("io.anserini.search.query." + args.queryGenerator).getConstructor().newInstance();
            keywordQuery = generator.buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("Unable to load QueryGenerator: " + args.topicReader);
        }
    }
    List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
    // Do not consider the tweets with tweet ids that are beyond the queryTweetTime
    // <querytweettime> tag contains the timestamp of the query in terms of the
    // chronologically nearest tweet id within the corpus
    Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t);
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(keywordQuery, BooleanClause.Occur.MUST);
    Query compositeQuery = builder.build();
    TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] {});
    if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
        if (args.arbitraryScoreTieBreak) {
            // Figure out how to break the scoring ties.
            rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
        } else {
            rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true);
        }
    }
    RerankerContext context = new RerankerContext<>(searcher, qid, keywordQuery, null, queryString, queryTokens, filter, args);
    ScoredDocuments scoredFbDocs;
    if (isRerank && args.rf_qrels != null) {
        if (hasRelDocs) {
            scoredFbDocs = queryQrels;
        } else {
            // if no relevant documents, only perform score based tie breaking next
            scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
            cascade = new RerankerCascade();
            cascade.add(new ScoreTiesAdjusterReranker());
        }
    } else {
        scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
    }
    return cascade.run(scoredFbDocs, context);
}
Also used : TotalHits(org.apache.lucene.search.TotalHits) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) TermInSetQuery(org.apache.lucene.search.TermInSetQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) ScoredDocuments(io.anserini.rerank.ScoredDocuments) QueryNodeException(org.apache.lucene.queryparser.flexible.core.QueryNodeException) IOException(java.io.IOException) CompletionException(java.util.concurrent.CompletionException) CmdLineException(org.kohsuke.args4j.CmdLineException) AtomicMoveNotSupportedException(java.nio.file.AtomicMoveNotSupportedException) TopDocs(org.apache.lucene.search.TopDocs) RerankerCascade(io.anserini.rerank.RerankerCascade) QueryGenerator(io.anserini.search.query.QueryGenerator) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) ScoreTiesAdjusterReranker(io.anserini.rerank.lib.ScoreTiesAdjusterReranker) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) RerankerContext(io.anserini.rerank.RerankerContext)

Example 12 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SearchCollection method search.

public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryString, RerankerCascade cascade, ScoredDocuments queryQrels, boolean hasRelDocs) throws IOException {
    Query query = null;
    if (args.sdm) {
        query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    } else {
        QueryGenerator generator;
        try {
            generator = (QueryGenerator) Class.forName("io.anserini.search.query." + args.queryGenerator).getConstructor().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("Unable to load QueryGenerator: " + args.topicReader);
        }
        query = generator.buildQuery(IndexArgs.CONTENTS, analyzer, queryString);
    }
    TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] {});
    if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
        if (args.arbitraryScoreTieBreak) {
            // Figure out how to break the scoring ties.
            rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
        } else {
            rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
        }
    }
    List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
    queries.put(qid.toString(), queryTokens);
    RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
    ScoredDocuments scoredFbDocs;
    if (isRerank && args.rf_qrels != null) {
        if (hasRelDocs) {
            scoredFbDocs = queryQrels;
        } else {
            // if no relevant documents, only perform score based tie breaking next
            LOG.info("No relevant documents for " + qid.toString());
            scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
            cascade = new RerankerCascade();
            cascade.add(new ScoreTiesAdjusterReranker());
        }
    } else {
        scoredFbDocs = ScoredDocuments.fromTopDocs(rs, searcher);
    }
    return cascade.run(scoredFbDocs, context);
}
Also used : TotalHits(org.apache.lucene.search.TotalHits) Query(org.apache.lucene.search.Query) TermInSetQuery(org.apache.lucene.search.TermInSetQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) ScoredDocuments(io.anserini.rerank.ScoredDocuments) QueryNodeException(org.apache.lucene.queryparser.flexible.core.QueryNodeException) IOException(java.io.IOException) CompletionException(java.util.concurrent.CompletionException) CmdLineException(org.kohsuke.args4j.CmdLineException) AtomicMoveNotSupportedException(java.nio.file.AtomicMoveNotSupportedException) TopDocs(org.apache.lucene.search.TopDocs) RerankerCascade(io.anserini.rerank.RerankerCascade) QueryGenerator(io.anserini.search.query.QueryGenerator) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) ScoreTiesAdjusterReranker(io.anserini.rerank.lib.ScoreTiesAdjusterReranker) SdmQueryGenerator(io.anserini.search.query.SdmQueryGenerator) RerankerContext(io.anserini.rerank.RerankerContext)

Example 13 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class AxiomReranker method processExternalContext.

/**
 * If the external reranking context is not null we will first search against the external
 * index and return the top ranked documents.
 *
 * @param docs The initial ranking results against target index. We will return them if external
 *             index is null.
 *
 * @return Top ranked ScoredDocuments from searching external index
 */
private ScoredDocuments processExternalContext(ScoredDocuments docs, RerankerContext<T> context) throws IOException {
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
        }
        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(context.getIndexSearcher().getSimilarity());
        SearchArgs args = new SearchArgs();
        args.hits = this.R;
        args.arbitraryScoreTieBreak = context.getSearchArgs().arbitraryScoreTieBreak;
        args.searchtweets = context.getSearchArgs().searchtweets;
        RerankerContext<T> externalContext = new RerankerContext<>(searcher, context.getQueryId(), context.getQuery(), context.getQueryDocId(), context.getQueryText(), context.getQueryTokens(), context.getFilter(), args);
        return searchTopDocs(null, externalContext);
    } else {
        return docs;
    }
}
Also used : Path(java.nio.file.Path) IndexSearcher(org.apache.lucene.search.IndexSearcher) SearchArgs(io.anserini.search.SearchArgs) IndexReader(org.apache.lucene.index.IndexReader) RerankerContext(io.anserini.rerank.RerankerContext)

Example 14 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SimpleImpactSearcher method _search.

// internal implementation
protected Result[] _search(Query query, int k) throws IOException {
    // Create an IndexSearch only once. Note that the object is thread safe.
    if (searcher == null) {
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);
    }
    SearchArgs searchArgs = new SearchArgs();
    searchArgs.arbitraryScoreTieBreak = false;
    searchArgs.hits = k;
    TopDocs rs;
    RerankerContext context;
    rs = searcher.search(query, k, BREAK_SCORE_TIES_BY_DOCID, true);
    context = new RerankerContext<>(searcher, null, query, null, null, null, null, searchArgs);
    ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    Result[] results = new Result[hits.ids.length];
    for (int i = 0; i < hits.ids.length; i++) {
        Document doc = hits.documents[i];
        String docid = doc.getField(IndexArgs.ID).stringValue();
        IndexableField field;
        field = doc.getField(IndexArgs.CONTENTS);
        String contents = field == null ? null : field.stringValue();
        field = doc.getField(IndexArgs.RAW);
        String raw = field == null ? null : field.stringValue();
        results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc);
    }
    return results;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) IndexableField(org.apache.lucene.index.IndexableField) ScoredDocuments(io.anserini.rerank.ScoredDocuments) Document(org.apache.lucene.document.Document) RerankerContext(io.anserini.rerank.RerankerContext)

Example 15 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SimpleTweetSearcher method searchTweets.

protected Result[] searchTweets(Query query, List<String> queryTokens, String queryString, int k, long t) throws IOException {
    // Create an IndexSearch only once. Note that the object is thread safe.
    if (searcher == null) {
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);
    }
    SearchArgs searchArgs = new SearchArgs();
    searchArgs.arbitraryScoreTieBreak = false;
    searchArgs.hits = k;
    searchArgs.searchtweets = true;
    TopDocs rs;
    RerankerContext context;
    // Do not consider the tweets with tweet ids that are beyond the queryTweetTime
    // <querytweettime> tag contains the timestamp of the query in terms of the
    // chronologically nearest tweet id within the corpus
    Query filter = LongPoint.newRangeQuery(TweetGenerator.TweetField.ID_LONG.name, 0L, t);
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(query, BooleanClause.Occur.MUST);
    Query compositeQuery = builder.build();
    rs = searcher.search(compositeQuery, useRM3 ? searchArgs.rerankcutoff : k, BREAK_SCORE_TIES_BY_TWEETID, true);
    context = new RerankerContext<>(searcher, null, compositeQuery, null, queryString, queryTokens, filter, searchArgs);
    ScoredDocuments hits = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    Result[] results = new Result[hits.ids.length];
    for (int i = 0; i < hits.ids.length; i++) {
        Document doc = hits.documents[i];
        String docid = doc.getField(IndexArgs.ID).stringValue();
        IndexableField field;
        field = doc.getField(IndexArgs.CONTENTS);
        String contents = field == null ? null : field.stringValue();
        field = doc.getField(IndexArgs.RAW);
        String raw = field == null ? null : field.stringValue();
        results[i] = new Result(docid, hits.ids[i], hits.scores[i], contents, raw, doc);
    }
    return results;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) BooleanQuery(org.apache.lucene.search.BooleanQuery) ScoredDocuments(io.anserini.rerank.ScoredDocuments) Document(org.apache.lucene.document.Document) LongPoint(org.apache.lucene.document.LongPoint) TopDocs(org.apache.lucene.search.TopDocs) IndexableField(org.apache.lucene.index.IndexableField) RerankerContext(io.anserini.rerank.RerankerContext)

Aggregations

RerankerContext (io.anserini.rerank.RerankerContext)15 ScoredDocuments (io.anserini.rerank.ScoredDocuments)9 TopDocs (org.apache.lucene.search.TopDocs)9 IndexSearcher (org.apache.lucene.search.IndexSearcher)8 Query (org.apache.lucene.search.Query)7 Document (org.apache.lucene.document.Document)6 FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors)4 RerankerCascade (io.anserini.rerank.RerankerCascade)4 IndexReader (org.apache.lucene.index.IndexReader)4 BooleanQuery (org.apache.lucene.search.BooleanQuery)4 EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer)3 IndexableField (org.apache.lucene.index.IndexableField)3 Terms (org.apache.lucene.index.Terms)3 QueryNodeException (org.apache.lucene.queryparser.flexible.core.QueryNodeException)3 ScoreDoc (org.apache.lucene.search.ScoreDoc)3 TermInSetQuery (org.apache.lucene.search.TermInSetQuery)3 CmdLineException (org.kohsuke.args4j.CmdLineException)3 ScoreTiesAdjusterReranker (io.anserini.rerank.lib.ScoreTiesAdjusterReranker)2 RemoveRetweetsTemporalTiebreakReranker (io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker)2 QueryGenerator (io.anserini.search.query.QueryGenerator)2