Search in sources:

Example 76 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

In class Rm3Reranker, the method rerank:

/**
 * Reranks the given documents with RM3 query expansion: estimates a relevance
 * model from the feedback documents, interpolates it with the original query,
 * and re-executes the expanded weighted query against the index.
 *
 * @param docs    initial retrieval results (documents and parallel scores)
 * @param context reranker context providing the searcher, query text, and optional filter
 * @return documents re-scored by the expanded query, or the original {@code docs}
 *         unchanged if the expanded query fails to parse or the search fails
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    Preconditions.checkState(docs.documents.length == docs.scores.length);
    IndexSearcher searcher = context.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();
    // Original query as an L1-normalized bag-of-words feature vector.
    FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
    // Relevance model estimated from the top-ranked feedback documents.
    FeatureVector rm = estimateRelevanceModel(docs, reader);
    LOG.info("Relevance model estimated.");
    // Interpolate: originalQueryWeight * qfv + (1 - originalQueryWeight) * rm.
    rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);
    // Serialize the interpolated model as whitespace-separated "term^weight" clauses.
    StringBuilder builder = new StringBuilder();
    Iterator<String> terms = rm.iterator();
    while (terms.hasNext()) {
        String term = terms.next();
        double prob = rm.getFeatureWeight(term);
        // Chain appends rather than concatenating inside append().
        builder.append(term).append('^').append(prob).append(' ');
    }
    String queryText = builder.toString().trim();
    // WhitespaceAnalyzer: terms were already analyzed above; don't re-analyze them.
    QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
    Query nq;
    try {
        nq = p.parse(queryText);
    } catch (ParseException e) {
        // Log (not printStackTrace) and fall back to the original ranking.
        LOG.error("Failed to parse expanded query: " + queryText, e);
        return docs;
    }
    LOG.info("Running new query: " + nq);
    TopDocs rs;
    try {
        if (context.getFilter() == null) {
            rs = searcher.search(nq, 1000);
        } else {
            // Apply the context filter as a non-scoring FILTER clause.
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
            bqBuilder.add(nq, BooleanClause.Occur.MUST);
            Query q = bqBuilder.build();
            rs = searcher.search(q, 1000);
        }
    } catch (IOException e) {
        // Log and fall back to the original ranking.
        LOG.error("Search with expanded query failed.", e);
        return docs;
    }
    return ScoredDocuments.fromTopDocs(rs, searcher);
}
Also used : FeatureVector(io.anserini.util.FeatureVector) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) IOException(java.io.IOException) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Example 77 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

In class EntityLinking, the method search:

/**
 * Returns a list of top-ranked entities for the given name. First tries an
 * exact (phrase) match on the entity-name field; if that yields fewer than
 * {@code numHits} results, fills the remainder with a TF-IDF (ClassicSimilarity)
 * search over the name and label fields.
 *
 * @param queryName the entity name to search
 * @param numHits   maximum number of entities to return
 * @throws Exception on error
 * @return a list of top ranked entities
 */
public List<RankedEntity> search(String queryName, int numHits) throws Exception {
    List<RankedEntity> rankedEntities = new ArrayList<>();
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // Phase 1: exact phrase search on the query name (slop 3 tolerates small gaps).
    QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
    queryParser.setAutoGeneratePhraseQueries(true);
    queryParser.setPhraseSlop(3);
    queryName = "\"" + queryName + "\"";
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    appendRankedEntities(docs, rankedEntities);
    if (docs.documents.length >= numHits) {
        return rankedEntities;
    }
    int numHitsLeft = numHits - docs.documents.length;
    // Phase 2: TF-IDF search over name and label fields for the remaining slots.
    // NOTE(review): results may overlap with phase 1 — confirm duplicates are acceptable.
    Similarity similarity = new ClassicSimilarity();
    searcher.setSimilarity(similarity);
    queryParser = new MultiFieldQueryParser(new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL }, new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.AND);
    query = queryParser.parse(queryName);
    rs = searcher.search(query, numHitsLeft);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    appendRankedEntities(docs, rankedEntities);
    return rankedEntities;
}

/**
 * Converts each scored document into a {@code RankedEntity} (short MID, score,
 * name, label) and appends it to the output list.
 */
private void appendRankedEntities(ScoredDocuments docs, List<RankedEntity> rankedEntities) {
    for (int i = 0; i < docs.documents.length; i++) {
        float score = docs.scores[i];
        String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        String shortMid = getShortMid(mid);
        String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
        String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
        rankedEntities.add(new RankedEntity(shortMid, score, name, label));
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Similarity(org.apache.lucene.search.similarities.Similarity) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ArrayList(java.util.ArrayList) ScoredDocuments(io.anserini.rerank.ScoredDocuments) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser)

Example 78 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class TrainingDataGenerator method birthdate.

/**
 * Generate training data for property birth date: matches every document that
 * has the birth-date field, then writes (freebase id, English label, birth date)
 * triples to the training-examples output file.
 * <p>
 * Note: this function might need some refactoring when we add more properties
 *
 * @throws ParseException if the wildcard query cannot be parsed
 * @throws IOException    on index-access error
 */
void birthdate() throws ParseException, IOException {
    // Leading wildcard must be explicitly enabled for the bare "*" query below.
    QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
    queryParser.setAllowLeadingWildcard(true);
    // "*" on FIELD_BIRTHDATE matches every document that has a birth date.
    Query q = queryParser.parse("*");
    LOG.info("Starting the search using query: {}", q.toString());
    // Collect all matching documents in a set of matching doc ids
    Set<Integer> matchingDocIds = new HashSet<>();
    getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
    LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());
    // Process the retrieved document ids
    matchingDocIds.forEach((Integer docId) -> {
        Document doc = null;
        try {
            doc = getKbIndexReader().document(docId);
        } catch (IOException e) {
            // Best-effort: skip documents that fail to load rather than aborting the run.
            LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
            return;
        }
        String freebaseURI = doc.get(IndexNodes.FIELD_ID);
        // We might have multiple values for the field
        String[] birthdates = doc.getValues(FIELD_BIRTHDATE);
        // Get the freebase English label of this entity
        String[] labels = doc.getValues(FIELD_LABEL);
        String englishLabel = null;
        for (String label : labels) {
            // Labels are stored as N-Triples literals; pick the first with language tag "en".
            Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
            if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
                englishLabel = literal.stringValue();
                break;
            }
        }
        // Basically make sure label is not null, for some entities in freebase
        if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0)
            // Ignore this search
            return;
        String freebaseId = freebaseUriToFreebaseId(freebaseURI);
        for (String birthdate : birthdates) {
            // Get string value
            String birthdateVal = extractValueFromTypedLiteralString(birthdate);
            // Write property value as training data
            writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
        }
    // TODO - After building an index for the mentions of Freebase entities in ClueWeb,
    // we need to get the ClueWeb mentions of this freebase entity and write them to a separate file
    });
}
Also used : QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) CheckHits(org.apache.lucene.search.CheckHits) Literal(org.openrdf.model.Literal) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) HashSet(java.util.HashSet)

Example 79 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class PyseriniEntryPoint method search.

/**
 * Runs each topic query against the index, reranks the results through the
 * given cascade, and returns the accumulated docid-to-score map.
 *
 * @param topics         queries keyed by topic id
 * @param similarity     scoring model to apply to the searcher
 * @param numHits        number of hits to retrieve per query
 * @param cascade        reranker cascade applied to the initial results
 * @param useQueryParser if true, parse the query text with {@link QueryParser};
 *                       otherwise build a bag-of-words query
 * @param keepstopwords  if true, use an EnglishAnalyzer with an empty stopword set
 * @return map from docid to its (reranked) score; later topics overwrite
 *         earlier scores for the same docid
 * @throws IOException    on index-access error
 * @throws ParseException if a query fails to parse
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    // Empty stopword set keeps stopwords in both indexing-time analysis and the query.
    EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        TopDocs rs = searcher.search(query, numHits);
        List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
        RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        for (int i = 0; i < docs.documents.length; i++) {
            String docid = docs.documents[i].getField(FIELD_ID).stringValue();
            float score = docs.scores[i];
            scoredDocs.put(docid, score);
        }
    }
    return scoredDocs;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) RerankerContext(io.anserini.rerank.RerankerContext)

Example 80 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class RetrieveSentences method search.

/**
 * Runs each topic query against the index with BM25 scoring and returns the
 * accumulated docid-to-score map.
 *
 * @param topics  queries keyed by topic id (ids are not used in scoring)
 * @param numHits number of hits to retrieve per query
 * @return map from docid to score; later topics overwrite earlier scores
 *         for the same docid
 * @throws IOException    on index-access error
 * @throws ParseException declared for interface compatibility
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // using BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);
    EnglishAnalyzer ea = new EnglishAnalyzer();
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    // Topic ids are unused; iterate values only. (Removed an unused QueryParser
    // and unused locals that the original constructed but never read.)
    for (String queryString : topics.values()) {
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        TopDocs rs = searcher.search(query, numHits);
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)

Aggregations

QueryParser (org.apache.lucene.queryparser.classic.QueryParser)114 Query (org.apache.lucene.search.Query)79 IndexSearcher (org.apache.lucene.search.IndexSearcher)54 Document (org.apache.lucene.document.Document)47 TopDocs (org.apache.lucene.search.TopDocs)39 ParseException (org.apache.lucene.queryparser.classic.ParseException)38 IOException (java.io.IOException)35 Analyzer (org.apache.lucene.analysis.Analyzer)34 ScoreDoc (org.apache.lucene.search.ScoreDoc)31 IndexReader (org.apache.lucene.index.IndexReader)27 ArrayList (java.util.ArrayList)25 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)24 BooleanQuery (org.apache.lucene.search.BooleanQuery)23 TermQuery (org.apache.lucene.search.TermQuery)19 MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser)12 HashSet (java.util.HashSet)10 Directory (org.apache.lucene.store.Directory)10 IndexWriter (org.apache.lucene.index.IndexWriter)9 IndexableField (org.apache.lucene.index.IndexableField)8 KeywordAnalyzer (org.apache.lucene.analysis.core.KeywordAnalyzer)7