Example 61 with QueryParser

Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

From the class IndexUtils, method printTermCounts.

public void printTermCounts(String termStr) throws IOException, ParseException {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    // Parsing through an analyzer-backed QueryParser stems the raw term the same way it was stemmed at index time
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    TermQuery q = (TermQuery) qp.parse(termStr);
    Term t = q.getTerm();
    System.out.println("raw term:             " + termStr);
    System.out.println("stemmed term:         " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("collection frequency: " + reader.totalTermFreq(t));
    System.out.println("document frequency:   " + reader.docFreq(t));
    // Walk the term's postings, printing (docID, term frequency) pairs;
    // getTermDocsEnum returns null if the term does not occur in the field
    PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes());
    System.out.println("postings:\n");
    while (postingsEnum != null && postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
    }
}
Also used: QueryParser (org.apache.lucene.queryparser.classic.QueryParser), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer)
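
The point of this idiom: parsing the raw string through a QueryParser backed by the same EnglishAnalyzer used at index time yields the stemmed term, so the printed collection statistics line up with what was actually indexed. A minimal, self-contained sketch of the pattern, assuming a single-term input; the index path and field name ("path/to/index", "contents") are placeholders, and in Lucene versions before 7.0 CharArraySet lives in org.apache.lucene.analysis.util instead:

import java.nio.file.Paths;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;

public class TermStatsSketch {
    public static void main(String[] args) throws Exception {
        // "path/to/index" is a placeholder; point it at any existing Lucene index
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("path/to/index")))) {
            EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
            QueryParser qp = new QueryParser("contents", ea);
            // The analyzer stems the raw term, e.g. "running" -> "run"
            Term t = ((TermQuery) qp.parse("running")).getTerm();
            System.out.println("stemmed term:         " + t.text());
            System.out.println("collection frequency: " + reader.totalTermFreq(t));
            System.out.println("document frequency:   " + reader.docFreq(t));
        }
    }
}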

Example 62 with QueryParser

Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

From the class TrainingDataGenerator, method birthdate.

/**
   * Generates training data for the birth date property.
   * <p>
   * Note: this function might need some refactoring when we add more properties.
   */
void birthdate() throws ParseException, IOException {
    QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
    queryParser.setAllowLeadingWildcard(true);
    Query q = queryParser.parse("*");
    LOG.info("Starting the search using query: {}", q.toString());
    // Collect all matching documents in a set of matching doc ids
    Set<Integer> matchingDocIds = new HashSet<>();
    getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
    LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());
    // Process the retrieved document ids
    matchingDocIds.forEach((Integer docId) -> {
        Document doc = null;
        try {
            doc = getKbIndexReader().document(docId);
        } catch (IOException e) {
            LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
            return;
        }
        String freebaseURI = doc.get(ObjectTriplesLuceneDocumentGenerator.FIELD_SUBJECT);
        // We might have multiple values for the field
        String[] birthdates = doc.getValues(FIELD_BIRTHDATE);
        // Get the freebase English label of this entity
        String[] labels = doc.getValues(FIELD_LABEL);
        String englishLabel = null;
        for (String label : labels) {
            Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
            if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
                englishLabel = literal.stringValue();
                break;
            }
        }
        // Skip entities that lack an English label, a subject URI, or any birth date value
        if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0)
            return;
        String freebaseId = freebaseUriToFreebaseId(freebaseURI);
        for (String birthdate : birthdates) {
            // Get string value
            String birthdateVal = extractValueFromTypedLiteralString(birthdate);
            // Write property value as training data
            writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
        }
    // TODO - After building an index for the mentions of Freebase entities in ClueWeb,
    // we need to get the ClueWeb mentions of this freebase entity and write them to a separate file
    });
}
Also used: QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Query (org.apache.lucene.search.Query), CheckHits (org.apache.lucene.search.CheckHits), Literal (org.openrdf.model.Literal), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), HashSet (java.util.HashSet)
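
CheckHits.SetCollector used above is a Lucene test-framework class; outside of tests, the same "collect every matching doc id" step can be written with a plain SimpleCollector. A sketch under that assumption, against the Lucene 6/7-era API these snippets use (the field name "birthdate" and the StandardAnalyzer are placeholders):

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;

public class MatchAllIdsSketch {
    static Set<Integer> matchingDocIds(IndexSearcher searcher) throws IOException, ParseException {
        QueryParser parser = new QueryParser("birthdate", new StandardAnalyzer());
        // "*" is a leading wildcard, which the classic parser rejects by default
        parser.setAllowLeadingWildcard(true);
        Query q = parser.parse("*");
        Set<Integer> ids = new HashSet<>();
        searcher.search(q, new SimpleCollector() {
            private int docBase;
            @Override
            protected void doSetNextReader(LeafReaderContext context) {
                // Doc ids are per-segment; shift by the segment's base to get index-wide ids
                docBase = context.docBase;
            }
            @Override
            public void collect(int doc) {
                ids.add(docBase + doc);
            }
            @Override
            public boolean needsScores() {
                return false;
            }
        });
        return ids;
    }
}

In Lucene 8 and later, needsScores() is replaced by scoreMode(); a MatchAllDocsQuery would also sidestep the leading-wildcard parse entirely.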

Example 63 with QueryParser

Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

From the class SearchWebCollection, method search.

/**
   * Writes a run in TREC submission format to the given file.
   *
   * @param topics         topic queries, keyed by topic number
   * @param submissionFile path of the output run file
   * @param similarity     scoring function to use
   * @param numHits        number of hits to retrieve per topic
   * @param cascade        reranker cascade applied to the initial results
   * @param useQueryParser if true, parse topics with the classic QueryParser; otherwise build a bag-of-words query
   * @param keepstopwords  if true, keep stopwords during analysis
   * @throws IOException
   * @throws ParseException
   */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(similarity);
    final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
    PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
    EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    for (Map.Entry<Integer, String> entry : topics.entrySet()) {
        int qID = entry.getKey();
        String queryString = entry.getValue();
        Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
        // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the top 10,000 documents for each topic query.
        TopDocs rs = searcher.search(query, numHits);
        ScoreDoc[] hits = rs.scoreDocs;
        List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
        RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
        ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
        // TREC run format, one line per retrieved document:
        //   column 1: topic number
        //   column 2: currently unused; always "Q0"
        //   column 3: official document identifier of the retrieved document
        //   column 4: rank at which the document was retrieved
        //   column 5: score (integer or floating point) that generated the ranking
        //   column 6: the "run tag", a unique identifier for the run
        for (int i = 0; i < docs.documents.length; i++) {
            out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
        }
    }
    out.flush();
    out.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), Query (org.apache.lucene.search.Query), ScoredDocuments (io.anserini.rerank.ScoredDocuments), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), ScoreDoc (org.apache.lucene.search.ScoreDoc), TopDocs (org.apache.lucene.search.TopDocs), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), Map (java.util.Map), SortedMap (java.util.SortedMap), RerankerContext (io.anserini.rerank.RerankerContext), PrintWriter (java.io.PrintWriter)
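
Each line written above follows the six-column TREC run format spelled out in the comment. A tiny illustration of the same String.format call, with all values invented:

public class TrecLineSketch {
    public static void main(String[] args) {
        // Invented values for illustration only
        int qID = 301;
        String docid = "clueweb09-en0000-00-00000";
        int rank = 1;
        double score = 12.345678;
        String runTag = "BM25_EnglishAnalyzer_body_BM25";
        System.out.println(String.format("%d Q0 %s %d %f %s", qID, docid, rank, score, runTag));
        // prints: 301 Q0 clueweb09-en0000-00-00000 1 12.345678 BM25_EnglishAnalyzer_body_BM25
    }
}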

Example 64 with QueryParser

Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

From the class IdfPassageScorer, method score.

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();
    // QueryParser.escape is a static utility; it neutralizes query-syntax characters in the raw question
    String escapedQuery = QueryParser.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));
    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();
            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }
    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();
    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();
        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
                // Accumulate IDF only for distinct question terms that appear in the sentence
                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                }
            } catch (Exception e) {
                continue;
            }
        }
        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}
Also used: ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity), TermQuery (org.apache.lucene.search.TermQuery), Query (org.apache.lucene.search.Query), EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer), Term (org.apache.lucene.index.Term), QueryParser (org.apache.lucene.queryparser.classic.QueryParser)
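
The per-term weight above is ClassicSimilarity's idf, the classic 1 + ln(numDocs / (docFreq + 1)) formulation of TF-IDF. A small helper isolating that lookup (the class and method names are illustrative, and the term is assumed to be already analyzed/stemmed):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;

public class IdfSketch {
    // idf grows as the term becomes rarer across the collection
    static double idf(IndexReader reader, String field, String term) throws IOException {
        ClassicSimilarity similarity = new ClassicSimilarity();
        Term t = new Term(field, term);
        return similarity.idf(reader.docFreq(t), reader.numDocs());
    }
}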

Example 65 with QueryParser

Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.

From the class Rm3Reranker, method rerank.

@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    Preconditions.checkState(docs.documents.length == docs.scores.length);
    IndexSearcher searcher = context.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();
    FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
    FeatureVector rm = estimateRelevanceModel(docs, reader);
    LOG.info("Relevance model estimated.");
    rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);
    StringBuilder builder = new StringBuilder();
    // Serialize the interpolated model as "term^weight" boost clauses
    Iterator<String> terms = rm.iterator();
    while (terms.hasNext()) {
        String term = terms.next();
        double prob = rm.getFeatureWeight(term);
        builder.append(term).append('^').append(prob).append(' ');
    }
    String queryText = builder.toString().trim();
    QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
    Query nq = null;
    try {
        nq = p.parse(queryText);
    } catch (ParseException e) {
        e.printStackTrace();
        return docs;
    }
    LOG.info("Running new query: " + nq);
    TopDocs rs = null;
    try {
        if (context.getFilter() == null) {
            rs = searcher.search(nq, 1000);
        } else {
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
            bqBuilder.add(nq, BooleanClause.Occur.MUST);
            Query q = bqBuilder.build();
            rs = searcher.search(q, 1000);
        }
    } catch (IOException e) {
        e.printStackTrace();
        return docs;
    }
    return ScoredDocuments.fromTopDocs(rs, searcher);
}
Also used: FeatureVector (io.anserini.util.FeatureVector), WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), IOException (java.io.IOException), QueryParser (org.apache.lucene.queryparser.classic.QueryParser), IndexReader (org.apache.lucene.index.IndexReader), ParseException (org.apache.lucene.queryparser.classic.ParseException)
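
The rewrite step depends on two details: the classic parser's term^weight boost syntax, and the WhitespaceAnalyzer, which passes the already-stemmed RM3 terms through untouched rather than analyzing them a second time. A stripped-down sketch of that construction (the field name and the weights are invented):

import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class BoostedQuerySketch {
    static Query expand(Map<String, Double> termWeights) throws ParseException {
        StringBuilder builder = new StringBuilder();
        // Serialize each term with its weight using the parser's boost syntax
        for (Map.Entry<String, Double> e : termWeights.entrySet()) {
            builder.append(e.getKey()).append('^').append(e.getValue()).append(' ');
        }
        // WhitespaceAnalyzer only splits on whitespace, so stemmed terms survive as-is
        QueryParser parser = new QueryParser("body", new WhitespaceAnalyzer());
        return parser.parse(builder.toString().trim());
    }

    public static void main(String[] args) throws ParseException {
        Map<String, Double> weights = new LinkedHashMap<>();
        weights.put("hubbl", 0.18);   // invented weights over already-stemmed terms
        weights.put("telescop", 0.12);
        weights.put("space", 0.07);
        System.out.println(expand(weights));  // boosted clauses, e.g. (body:hubbl)^0.18 ...
    }
}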

Aggregations

QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 67
Query (org.apache.lucene.search.Query): 46
IndexSearcher (org.apache.lucene.search.IndexSearcher): 30
Document (org.apache.lucene.document.Document): 25
IOException (java.io.IOException): 19
Analyzer (org.apache.lucene.analysis.Analyzer): 19
IndexReader (org.apache.lucene.index.IndexReader): 18
TopDocs (org.apache.lucene.search.TopDocs): 18
ScoreDoc (org.apache.lucene.search.ScoreDoc): 17
ArrayList (java.util.ArrayList): 14
BooleanQuery (org.apache.lucene.search.BooleanQuery): 14
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 13
ParseException (org.apache.lucene.queryparser.classic.ParseException): 12
TermQuery (org.apache.lucene.search.TermQuery): 11
Term (org.apache.lucene.index.Term): 6
MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery): 6
WildcardQuery (org.apache.lucene.search.WildcardQuery): 6
EnglishAnalyzer (org.apache.lucene.analysis.en.EnglishAnalyzer): 5
IndexWriter (org.apache.lucene.index.IndexWriter): 5
ScoredDocuments (io.anserini.rerank.ScoredDocuments): 4