
Example 41 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class TrainingDataGenerator method birthdate.

/**
 * Generates training data for the birth date property.
 * <p>
 * Note: this function might need some refactoring when we add more properties.
 */
void birthdate() throws ParseException, IOException {
    QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
    queryParser.setAllowLeadingWildcard(true);
    Query q = queryParser.parse("*");
    LOG.info("Starting the search using query: {}", q.toString());
    // Collect all matching documents in a set of matching doc ids
    Set<Integer> matchingDocIds = new HashSet<>();
    getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
    LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());
    // Process the retrieved document ids
    matchingDocIds.forEach((Integer docId) -> {
        Document doc = null;
        try {
            doc = getKbIndexReader().document(docId);
        } catch (IOException e) {
            LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
            return;
        }
        String freebaseURI = doc.get(IndexNodes.FIELD_ID);
        // We might have multiple values for the field
        String[] birthdates = doc.getValues(FIELD_BIRTHDATE);
        // Get the freebase English label of this entity
        String[] labels = doc.getValues(FIELD_LABEL);
        String englishLabel = null;
        for (String label : labels) {
            Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
            if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
                englishLabel = literal.stringValue();
                break;
            }
        }
        // Skip entities that lack an English label, a Freebase URI, or any birth date values
        if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0)
            return;
        String freebaseId = freebaseUriToFreebaseId(freebaseURI);
        for (String birthdate : birthdates) {
            // Get string value
            String birthdateVal = extractValueFromTypedLiteralString(birthdate);
            // Write property value as training data
            writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
        }
    // TODO - After building an index for the mentions of Freebase entities in ClueWeb,
    // we need to get the ClueWeb mentions of this freebase entity and write them to a separate file
    });
}
Also used: QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Query (org.apache.lucene.search.Query), CheckHits (org.apache.lucene.search.CheckHits), Literal (org.openrdf.model.Literal), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), HashSet (java.util.HashSet)
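
A minimal standalone sketch of the leading-wildcard technique above: the classic QueryParser rejects queries that start with a wildcard by default, so setAllowLeadingWildcard(true) is needed before "*" will parse into a query matching every document with a value in the field. The in-memory directory, field name, and sample value are illustrative assumptions, not Anserini code (ByteBuffersDirectory requires Lucene 8+; older versions would use RAMDirectory).

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteBuffersDirectory;

public class LeadingWildcardSketch {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory dir = new ByteBuffersDirectory();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            // One toy document with a value in the hypothetical "birthdate" field
            Document doc = new Document();
            doc.add(new TextField("birthdate", "1972-10-17", Field.Store.YES));
            writer.addDocument(doc);
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            QueryParser parser = new QueryParser("birthdate", analyzer);
            // Without this opt-in, parse("*") throws a ParseException
            parser.setAllowLeadingWildcard(true);
            Query q = parser.parse("*");
            System.out.println(new IndexSearcher(reader).count(q) + " document(s) have a birthdate");
        }
    }
}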

Example 42 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class PyseriniEntryPoint method search.

/**
 * Runs each topic through retrieval and the reranker cascade, returning a map
 * from document id to score.
 *
 * @param topics     queries
 * @param similarity similarity
 * @return map from document id to retrieval score
 * @throws IOException
 * @throws ParseException
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
        RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        for (int i = 0; i < docs.documents.length; i++) {
            String docid = docs.documents[i].getField(FIELD_ID).stringValue();
            float score = docs.scores[i];
            scoredDocs.put(docid, score);
        }
    }
    return scoredDocs;
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), Query (org.apache.lucene.search.Query), ScoredDocuments (io.anserini.rerank.ScoredDocuments), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), ScoreDoc (org.apache.lucene.search.ScoreDoc), TopDocs (org.apache.lucene.search.TopDocs), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), RerankerContext (io.anserini.rerank.RerankerContext)
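
The keepstopwords flag above works by swapping the analyzer's stopword set: the default EnglishAnalyzer removes stopwords at parse time, while CharArraySet.EMPTY_SET keeps them. A minimal sketch of just that toggle, using an illustrative field name rather than Anserini's FIELD_BODY constant:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;

public class StopwordToggleSketch {
    public static void main(String[] args) throws Exception {
        QueryParser dropStops = new QueryParser("contents", new EnglishAnalyzer());
        QueryParser keepStops = new QueryParser("contents", new EnglishAnalyzer(CharArraySet.EMPTY_SET));
        // All four distinct words here are English stopwords, so the first parse yields an empty query
        System.out.println("drop: " + dropStops.parse("to be or not to be"));
        System.out.println("keep: " + keepStops.parse("to be or not to be"));
    }
}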

Example 43 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class RetrieveSentences method search.

public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    // using BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);
    EnglishAnalyzer ea = new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
        for (int i = 0; i < docs.documents.length; i++) {
            scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
        }
    }
    return scoredDocs;
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), Similarity (org.apache.lucene.search.similarities.Similarity), BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity), Query (org.apache.lucene.search.Query), ScoredDocuments (io.anserini.rerank.ScoredDocuments), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), ScoreDoc (org.apache.lucene.search.ScoreDoc), TopDocs (org.apache.lucene.search.TopDocs), QueryParser (org.apache.lucene.queryparser.classic.QueryParser)
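
AnalyzerUtils.buildBagOfWordsQuery is an Anserini helper; a rough standalone equivalent in plain Lucene is sketched below. It runs the query string through the analyzer and ORs the resulting terms together, sidestepping QueryParser syntax (and its escaping rules) entirely. The method body is an assumption for illustration, not Anserini's actual implementation.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class BagOfWordsSketch {
    // OR together one TermQuery per analyzed token, bypassing query syntax entirely
    static Query buildBagOfWordsQuery(String field, Analyzer analyzer, String text) throws IOException {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                builder.add(new TermQuery(new Term(field, term.toString())), BooleanClause.Occur.SHOULD);
            }
            stream.end();
        }
        return builder.build();
    }

    public static void main(String[] args) throws IOException {
        System.out.println(buildBagOfWordsQuery("contents", new EnglishAnalyzer(), "black bear attacks"));
    }
}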

Example 44 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class IdfPassageScorer method score.

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    QueryParser queryParser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();
    String escapedQuery = QueryParser.escape(query);
    Query question = queryParser.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));
    EnglishAnalyzer englishAnalyzerWithStop = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser queryParserWithStop = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzerWithStop);
    Query questionWithStopWords = queryParserWithStop.parse(escapedQuery);
    HashSet<String> questionTermsIDF = new HashSet<>(Arrays.asList(questionWithStopWords.toString().trim().toLowerCase().split("\\s+")));
    // add the question terms to the termIDF Map
    for (String questionTerm : questionTermsIDF) {
        try {
            TermQuery q = (TermQuery) queryParserWithStop.parse(questionTerm);
            Term t = q.getTerm();
            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }
    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();
    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();
        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) queryParser.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                }
                TermQuery q2 = (TermQuery) queryParserWithStop.parse(term);
                Term t2 = q2.getTerm();
                double termIDFwithStop = similarity.idf(reader.docFreq(t2), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDFwithStop));
            } catch (Exception e) {
                continue;
            }
        }
        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}
Also used: ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity), TermQuery (org.apache.lucene.search.TermQuery), Query (org.apache.lucene.search.Query), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), Term (org.apache.lucene.index.Term), IOException (java.io.IOException), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), HashMap (java.util.HashMap), Map (java.util.Map), HashSet (java.util.HashSet)
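
The detour above through queryParser.parse and a TermQuery cast exists only to obtain an analyzed (stemmed) term; the IDF value itself comes from ClassicSimilarity. A self-contained sketch of that core computation on a toy index (field name and documents are illustrative; the exact idf formula differs slightly across Lucene versions, roughly 1 + ln(docCount / (docFreq + 1))):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.ByteBuffersDirectory;

public class IdfSketch {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (String text : new String[] {"rare term here", "common words", "common words again"}) {
                Document doc = new Document();
                doc.add(new TextField("body", text, Field.Store.NO));
                writer.addDocument(doc);
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            ClassicSimilarity similarity = new ClassicSimilarity();
            // A term in fewer documents gets a higher idf
            for (String term : new String[] {"rare", "common"}) {
                double idf = similarity.idf(reader.docFreq(new Term("body", term)), reader.numDocs());
                System.out.printf("idf(%s) = %.3f%n", term, idf);
            }
        }
    }
}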

Example 45 with QueryParser

use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

the class SearchWebCollection method search.

/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topics         queries
 * @param submissionFile path of the run file to write
 * @param similarity     similarity
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
    PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
    EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        /**
         * For Web Tracks 2010, 2011, and 2012, an experimental run consists of the top 10,000 documents for each topic query.
         */
        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
        RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        /**
         * The first column is the topic number.
         * The second column is currently unused and should always be "Q0".
         * The third column is the official document identifier of the retrieved document.
         * The fourth column is the rank at which the document was retrieved.
         * The fifth column shows the score (integer or floating point) that generated the ranking.
         * The sixth column is called the "run tag" and should be a unique identifier for your run.
         */
        for (int i = 0; i < docs.documents.length; i++) {
            out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
        }
    }
    out.flush();
    out.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), Query (org.apache.lucene.search.Query), ScoredDocuments (io.anserini.rerank.ScoredDocuments), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), ScoreDoc (org.apache.lucene.search.ScoreDoc), TopDocs (org.apache.lucene.search.TopDocs), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Map (java.util.Map), SortedMap (java.util.SortedMap), RerankerContext (io.anserini.rerank.RerankerContext), PrintWriter (java.io.PrintWriter)
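
The six-column layout described in the comment is the standard TREC run format. A tiny sketch that emits such lines to standard output; the topic number, document ids, scores, and run tag below are made up for illustration:

import java.io.PrintWriter;

public class TrecRunLineSketch {
    public static void main(String[] args) {
        try (PrintWriter out = new PrintWriter(System.out)) {
            // topic, "Q0", docid, rank (1-based), score, run tag
            String[] docids = {"clueweb09-en0000-00-00000", "clueweb09-en0000-00-00001"};
            double[] scores = {12.51, 11.20};
            for (int i = 0; i < docids.length; i++) {
                out.println(String.format("51 Q0 %s %d %f myRunTag", docids[i], i + 1, scores[i]));
            }
        }
    }
}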

Aggregations

QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 73
Query (org.apache.lucene.search.Query): 50
IndexSearcher (org.apache.lucene.search.IndexSearcher): 32
Document (org.apache.lucene.document.Document): 26
IOException (java.io.IOException): 24
Analyzer (org.apache.lucene.analysis.Analyzer): 21
TopDocs (org.apache.lucene.search.TopDocs): 21
IndexReader (org.apache.lucene.index.IndexReader): 18
ScoreDoc (org.apache.lucene.search.ScoreDoc): 18
ArrayList (java.util.ArrayList): 16
ParseException (org.apache.lucene.queryparser.classic.ParseException): 16
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 14
BooleanQuery (org.apache.lucene.search.BooleanQuery): 14
TermQuery (org.apache.lucene.search.TermQuery): 13
ScoredDocuments (io.anserini.rerank.ScoredDocuments): 6
Term (org.apache.lucene.index.Term): 6
MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery): 6
WildcardQuery (org.apache.lucene.search.WildcardQuery): 6
EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer): 5
IndexWriter (org.apache.lucene.index.IndexWriter): 5