Search in sources :

Example 6 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SimpleSearcher method search.

/**
 * Internal search implementation: runs {@code query} against the index, passes the
 * initial hits through the reranker cascade, and wraps the outcome as {@code Result}s.
 *
 * @param query       the query to execute
 * @param queryTokens tokens of the query, handed to the reranker context
 * @param queryString the query text, handed to the reranker context
 * @param k           number of hits requested
 * @return one {@code Result} per document returned by the cascade
 * @throws IOException declared by the signature; presumably raised by index access
 */
protected Result[] search(Query query, List<String> queryTokens, String queryString, int k) throws IOException {
    // The IndexSearcher is built lazily so it is created only once; the object is thread safe.
    if (searcher == null) {
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);
    }

    SearchArgs searchArgs = new SearchArgs();
    searchArgs.arbitraryScoreTieBreak = false;
    searchArgs.hits = k;

    // With RM3 enabled the initial retrieval asks for the rerank cutoff instead of k.
    final int initialHits = useRM3 ? searchArgs.rerankcutoff : k;
    TopDocs topDocs = searcher.search(query, initialHits, BREAK_SCORE_TIES_BY_DOCID, true);

    RerankerContext context = new RerankerContext<>(searcher, null, query, null, queryString, queryTokens, null, searchArgs);
    ScoredDocuments reranked = cascade.run(ScoredDocuments.fromTopDocs(topDocs, searcher), context);

    final int total = reranked.ids.length;
    Result[] results = new Result[total];
    for (int rank = 0; rank < total; rank++) {
        Document document = reranked.documents[rank];
        String docid = document.getField(IndexArgs.ID).stringValue();

        // The contents and raw fields may be missing from the document; report null for those.
        String contents = null;
        IndexableField contentsField = document.getField(IndexArgs.CONTENTS);
        if (contentsField != null) {
            contents = contentsField.stringValue();
        }

        String raw = null;
        IndexableField rawField = document.getField(IndexArgs.RAW);
        if (rawField != null) {
            raw = rawField.stringValue();
        }

        results[rank] = new Result(docid, reranked.ids[rank], reranked.scores[rank], contents, raw, document);
    }
    return results;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) IndexableField(org.apache.lucene.index.IndexableField) ScoredDocuments(io.anserini.rerank.ScoredDocuments) Document(org.apache.lucene.document.Document) RerankerContext(io.anserini.rerank.RerankerContext)

Example 7 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class DumpTweetsLtrData method main.

/**
 * Command-line entry point. Parses {@code argv} into {@code LtrArgs}, opens the index at
 * {@code args.index}, and for every topic read from {@code args.topics} runs a filtered
 * bag-of-words query whose hits are pushed through a cascade made of
 * {@code RemoveRetweetsTemporalTiebreakReranker} and {@code TweetsLtrDataGenerator}
 * (the latter is given the output stream opened on {@code args.output}).
 *
 * <p>The index directory, the index reader and the output stream are managed with
 * try-with-resources so they are released even when a query fails part-way through.
 *
 * @param argv command-line arguments; on a parse error usage is printed and the method returns
 * @throws Exception if any step of setup, searching or reranking fails
 */
public static void main(String[] argv) throws Exception {
    long curTime = System.nanoTime();
    LtrArgs args = new LtrArgs();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: DumpTweetsLtrData" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    LOG.info("Reading index at " + args.index);
    try (Directory dir = FSDirectory.open(Paths.get(args.index));
         IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        if (args.ql) {
            LOG.info("Using QL scoring model");
            searcher.setSimilarity(new LMDirichletSimilarity(args.mu));
        } else if (args.bm25) {
            LOG.info("Using BM25 scoring model");
            searcher.setSimilarity(new BM25Similarity(args.k1, args.b));
        } else {
            LOG.error("Error: Must specify scoring model!");
            System.exit(-1);
        }
        Qrels qrels = new Qrels(args.qrels);
        FeatureExtractors extractors = null;
        if (args.extractors != null) {
            extractors = FeatureExtractors.loadExtractor(args.extractors);
        }
        try (PrintStream out = new PrintStream(new FileOutputStream(new File(args.output)))) {
            RerankerCascade cascade = new RerankerCascade();
            cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
            cascade.add(new TweetsLtrDataGenerator(out, qrels, extractors));
            MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(args.topics));
            LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
            long totalTime = 0;
            int cnt = 0;
            for (MicroblogTopic topic : topics) {
                long curQueryTime = System.nanoTime();
                // Restrict candidates to ids in [0, topic.getQueryTweetTime()] and require a text match.
                Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
                Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
                BooleanQuery.Builder builder = new BooleanQuery.Builder();
                builder.add(filter, BooleanClause.Occur.FILTER);
                builder.add(query, BooleanClause.Occur.MUST);
                Query q = builder.build();
                TopDocs rs = searcher.search(q, args.hits);
                List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
                RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
                cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
                long qtime = (System.nanoTime() - curQueryTime) / 1000000;
                LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
                totalTime += qtime;
                cnt++;
            }
            LOG.info("All queries completed!");
            LOG.info("Total elapsed time = " + totalTime + "ms");
            // An empty topic set would otherwise divide by zero; report 0 in that case.
            long averageLatency = cnt == 0 ? 0 : totalTime / cnt;
            LOG.info("Average query latency = " + averageLatency + "ms");
        }
    }
}
Also used : RemoveRetweetsTemporalTiebreakReranker(io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker) RerankerCascade(io.anserini.rerank.RerankerCascade) MicroblogTopicSet(io.anserini.search.MicroblogTopicSet) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Qrels(io.anserini.util.Qrels) PrintStream(java.io.PrintStream) LongPoint(org.apache.lucene.document.LongPoint) FeatureExtractors(io.anserini.ltr.feature.FeatureExtractors) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) MicroblogTopic(io.anserini.search.MicroblogTopic) LMDirichletSimilarity(org.apache.lucene.search.similarities.LMDirichletSimilarity) File(java.io.File) RerankerContext(io.anserini.rerank.RerankerContext)

Example 8 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class PyseriniEntryPoint method search.

/**
 * Runs every topic against the index, reranks the hits with {@code cascade}, and
 * collects the resulting scores keyed by document id.
 *
 * <p>The returned map is keyed by document id only, so when the same document is
 * returned for more than one topic the score stored is the one from the last topic
 * that produced it (the entry keeps the position of its first insertion).
 *
 * @param topics         queries, keyed by topic id
 * @param similarity     similarity installed on the searcher
 * @param numHits        number of hits requested from the initial search of each topic
 * @param cascade        rerankers applied to the initial hits of each topic
 * @param useQueryParser if true the query text is parsed with {@code QueryParser};
 *                       otherwise a bag-of-words query is built from it
 * @param keepstopwords  if true the analyzer is created with an empty stop-word set
 * @return document id to score, in insertion order
 * @throws IOException    declared by the signature; presumably raised by index access
 * @throws ParseException declared by the signature; presumably raised when a query string cannot be parsed
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    // The analyzer is closeable; release it once all topics have been processed.
    try (EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer()) {
        QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
        queryParser.setDefaultOperator(QueryParser.Operator.OR);
        for (Map.Entry<Integer, String> entry : topics.entrySet()) {
            int qID = entry.getKey();
            String queryString = entry.getValue();
            Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
            TopDocs rs = searcher.search(query, numHits);
            List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
            RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
            ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
            for (int i = 0; i < docs.documents.length; i++) {
                String docid = docs.documents[i].getField(FIELD_ID).stringValue();
                float score = docs.scores[i];
                scoredDocs.put(docid, score);
            }
        }
    }
    return scoredDocs;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) RerankerContext(io.anserini.rerank.RerankerContext)

Example 9 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class SearchWebCollection method search.

/**
 * Runs every topic against the index, reranks the hits with {@code cascade}, and
 * writes the results to {@code submissionFile} in TREC run format.
 *
 * <p>The writer and the analyzer are managed with try-with-resources so the file is
 * flushed and closed even when a query fails part-way through. Lines are formatted
 * with {@code Locale.ROOT} so the score always uses a decimal point regardless of the
 * JVM's default locale.
 *
 * @param topics         queries, keyed by topic id
 * @param submissionFile path of the run file to write (encoded as US-ASCII)
 * @param similarity     similarity installed on the searcher; its string form is part of the run tag
 * @param numHits        number of hits requested from the initial search of each topic
 * @param cascade        rerankers applied to the initial hits of each topic
 * @param useQueryParser if true the query text is parsed with {@code QueryParser};
 *                       otherwise a bag-of-words query is built from it
 * @param keepstopwords  if true the analyzer is created with an empty stop-word set
 * @throws IOException    declared by the signature; presumably raised by index or file access
 * @throws ParseException declared by the signature; presumably raised when a query string cannot be parsed
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
    try (PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
         EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer()) {
        QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
        queryParser.setDefaultOperator(QueryParser.Operator.OR);
        for (Map.Entry<Integer, String> entry : topics.entrySet()) {
            int qID = entry.getKey();
            String queryString = entry.getValue();
            Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
            // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
            // top 10,000 documents for each topic query.
            TopDocs rs = searcher.search(query, numHits);
            List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
            RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
            ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
            // Output columns:
            //   1. the topic number
            //   2. currently unused; always "Q0"
            //   3. the official document identifier of the retrieved document
            //   4. the rank at which the document is retrieved
            //   5. the score (integer or floating point) that generated the ranking
            //   6. the "run tag", which should be a unique identifier for your run
            for (int i = 0; i < docs.documents.length; i++) {
                out.println(String.format(java.util.Locale.ROOT, "%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
            }
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Map(java.util.Map) SortedMap(java.util.SortedMap) RerankerContext(io.anserini.rerank.RerankerContext) PrintWriter(java.io.PrintWriter)

Example 10 with RerankerContext

use of io.anserini.rerank.RerankerContext in project Anserini by castorini.

the class BaseFeatureExtractorTest method assertFeatureValues.

/**
 * Checks feature values that depend on several documents being present in the
 * collection at once: every text is added as a test document, the writer is
 * force-merged to one segment, and the features computed for the chosen document
 * are compared with {@code expected} within {@code DELTA}.
 *
 * <p>{@code docToExtract} is used both as the position in {@code docTexts} and as the
 * document number passed to {@code getTermVector}.
 *
 * @param expected            expected values of the computed features
 * @param queryText           query used to build the test context
 * @param docTexts            texts of the documents to add to the collection
 * @param extractors          the chain of feature extractors to run
 * @param docToExtract        index of the document whose features are computed
 */
protected void assertFeatureValues(float[] expected, String queryText, List<String> docTexts, FeatureExtractors extractors, int docToExtract) throws IOException {
    // Add each text in order, remembering the document created for it.
    List<Document> indexedDocs = new ArrayList<>();
    for (int i = 0; i < docTexts.size(); i++) {
        indexedDocs.add(addTestDocument(docTexts.get(i)));
    }
    testWriter.forceMerge(1);

    Document target = indexedDocs.get(docToExtract);
    RerankerContext context = makeTestContext(queryText);
    Terms termVector = context.getIndexSearcher().getIndexReader().getTermVector(docToExtract, TEST_FIELD_NAME);

    assertArrayEquals(expected, extractors.extractAll(target, termVector, context), DELTA);
}
Also used : ArrayList(java.util.ArrayList) IndexReader(org.apache.lucene.index.IndexReader) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) RerankerContext(io.anserini.rerank.RerankerContext)

Aggregations

RerankerContext (io.anserini.rerank.RerankerContext)15 ScoredDocuments (io.anserini.rerank.ScoredDocuments)9 TopDocs (org.apache.lucene.search.TopDocs)9 IndexSearcher (org.apache.lucene.search.IndexSearcher)8 Query (org.apache.lucene.search.Query)7 Document (org.apache.lucene.document.Document)6 FeatureExtractors (io.anserini.ltr.feature.FeatureExtractors)4 RerankerCascade (io.anserini.rerank.RerankerCascade)4 IndexReader (org.apache.lucene.index.IndexReader)4 BooleanQuery (org.apache.lucene.search.BooleanQuery)4 EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer)3 IndexableField (org.apache.lucene.index.IndexableField)3 Terms (org.apache.lucene.index.Terms)3 QueryNodeException (org.apache.lucene.queryparser.flexible.core.QueryNodeException)3 ScoreDoc (org.apache.lucene.search.ScoreDoc)3 TermInSetQuery (org.apache.lucene.search.TermInSetQuery)3 CmdLineException (org.kohsuke.args4j.CmdLineException)3 ScoreTiesAdjusterReranker (io.anserini.rerank.lib.ScoreTiesAdjusterReranker)2 RemoveRetweetsTemporalTiebreakReranker (io.anserini.rerank.twitter.RemoveRetweetsTemporalTiebreakReranker)2 QueryGenerator (io.anserini.search.query.QueryGenerator)2