Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
From the class IndexUtils, method printTermCounts:
public void printTermCounts(String termStr) throws IOException, ParseException {
  EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
  QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
  // Run the raw term through the analyzer (via the query parser) to obtain its stemmed form.
  TermQuery q = (TermQuery) qp.parse(termStr);
  Term t = q.getTerm();

  System.out.println("raw term: " + termStr);
  System.out.println("stemmed term: " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
  System.out.println("collection frequency: " + reader.totalTermFreq(t));
  System.out.println("document frequency: " + reader.docFreq(t));

  PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes());
  System.out.println("postings:\n");
  if (postingsEnum != null) { // null if the term does not occur in the field
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
    }
  }
}
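The QueryParser round trip above is just a convenient way to push a single term through the EnglishAnalyzer and recover its stemmed form. A minimal sketch of doing the same directly with the analyzer's TokenStream API (a hypothetical helper, not part of IndexUtils; it assumes org.apache.lucene.analysis.Analyzer, TokenStream, and CharTermAttribute are imported):

static String stemTerm(Analyzer analyzer, String field, String term) throws IOException {
  try (TokenStream stream = analyzer.tokenStream(field, term)) {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    // A single-word input normally yields a single token; fall back to the raw term otherwise.
    String stemmed = stream.incrementToken() ? termAttr.toString() : term;
    stream.end();
    return stemmed;
  }
}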
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
From the class TrainingDataGenerator, method birthdate:
/**
 * Generates training data for the property "birth date".
 * <p>
 * Note: this function might need some refactoring when we add more properties.
 */
void birthdate() throws ParseException, IOException {
  QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
  queryParser.setAllowLeadingWildcard(true);
  Query q = queryParser.parse("*");
  LOG.info("Starting the search using query: {}", q.toString());

  // Collect all matching documents in a set of matching doc ids
  Set<Integer> matchingDocIds = new HashSet<>();
  getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
  LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());

  // Process the retrieved document ids
  matchingDocIds.forEach((Integer docId) -> {
    Document doc = null;
    try {
      doc = getKbIndexReader().document(docId);
    } catch (IOException e) {
      LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
      return;
    }

    String freebaseURI = doc.get(ObjectTriplesLuceneDocumentGenerator.FIELD_SUBJECT);
    // We might have multiple values for the field
    String[] birthdates = doc.getValues(FIELD_BIRTHDATE);

    // Get the Freebase English label of this entity
    String[] labels = doc.getValues(FIELD_LABEL);
    String englishLabel = null;
    for (String label : labels) {
      Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
      if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
        englishLabel = literal.stringValue();
        break;
      }
    }

    // Skip this entity if any required field is missing; some entities in Freebase have no English label
    if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0) {
      return;
    }

    String freebaseId = freebaseUriToFreebaseId(freebaseURI);
    for (String birthdate : birthdates) {
      // Get the plain string value of the typed literal
      String birthdateVal = extractValueFromTypedLiteralString(birthdate);
      // Write property value as training data
      writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
    }
    // TODO - After building an index for the mentions of Freebase entities in ClueWeb,
    // we need to get the ClueWeb mentions of this freebase entity and write them to a separate file
  });
}
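CheckHits.SetCollector used above ships with Lucene's test framework; it simply records the id of every document the query matches. For reference, a minimal equivalent built on SimpleCollector could look like the sketch below (a hypothetical DocIdSetCollector, assuming the Lucene 6/7-era Collector API where needsScores() is the hook; newer Lucene versions use scoreMode() instead):

public class DocIdSetCollector extends SimpleCollector {
  private final Set<Integer> docIds;
  private int docBase = 0;

  public DocIdSetCollector(Set<Integer> docIds) {
    this.docIds = docIds;
  }

  @Override
  protected void doSetNextReader(LeafReaderContext context) {
    // Per-segment doc ids must be rebased to index-wide ids.
    docBase = context.docBase;
  }

  @Override
  public void collect(int doc) {
    docIds.add(docBase + doc);
  }

  @Override
  public boolean needsScores() {
    return false; // only the doc ids are needed, not scores
  }
}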
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
From the class SearchWebCollection, method search:
/**
 * Runs the given topics against the index and writes a run file in TREC submission
 * format to {@code submissionFile}.
 *
 * @param topics queries, keyed by topic number
 * @param submissionFile path of the output run file
 * @param similarity similarity (scoring function) to use
 * @param numHits number of hits to retrieve per topic
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser if true, parse the topic with QueryParser; otherwise build a bag-of-words query
 * @param keepstopwords if true, keep stopwords (use an empty stopword set)
 * @throws IOException
 * @throws ParseException
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
  PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));

  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

    // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the top 10,000 documents for each topic query.
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;

    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);

    /*
     * Output format, one line per retrieved document:
     * the first column is the topic number;
     * the second column is currently unused and should always be "Q0";
     * the third column is the official document identifier of the retrieved document;
     * the fourth column is the rank at which the document is retrieved;
     * the fifth column shows the score (integer or floating point) that generated the ranking;
     * the sixth column is called the "run tag" and should be a unique identifier for your run.
     */
    for (int i = 0; i < docs.documents.length; i++) {
      out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
    }
  }
  out.flush();
  out.close();
}
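When useQueryParser is false, the topic is not parsed at all; it is analyzed and turned into a plain bag-of-words query. A rough sketch of what such a query amounts to in Lucene terms, one optional term clause per analyzed token (the actual AnalyzerUtils.buildBagOfWordsQuery helper may differ in detail):

static Query bagOfWordsQuery(String field, List<String> tokens) {
  BooleanQuery.Builder builder = new BooleanQuery.Builder();
  for (String token : tokens) {
    // Each analyzed token contributes an optional (SHOULD) term clause.
    builder.add(new TermQuery(new Term(field, token)), BooleanClause.Occur.SHOULD);
  }
  return builder.build();
}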
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
From the class IdfPassageScorer, method score:
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
  // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
  EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
  QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
  ClassicSimilarity similarity = new ClassicSimilarity();

  String escapedQuery = QueryParser.escape(query);
  Query question = qp.parse(escapedQuery);
  HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

  // Add the question terms to the termIdf map
  for (String questionTerm : questionTerms) {
    try {
      TermQuery q = (TermQuery) qp.parse(questionTerm);
      Term t = q.getTerm();
      double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
      termIdfMap.put(questionTerm, String.valueOf(termIDF));
    } catch (Exception e) {
      continue;
    }
  }

  // Avoid duplicate passages
  HashSet<String> seenSentences = new HashSet<>();
  for (Map.Entry<String, Float> sent : sentences.entrySet()) {
    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    String[] terms = sent.getKey().toLowerCase().split("\\s+");
    for (String term : terms) {
      try {
        TermQuery q = (TermQuery) qp.parse(term);
        Term t = q.getTerm();
        double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        termIdfMap.put(term, String.valueOf(termIDF));
        // Sum the IDF of each distinct sentence term that also appears in the question
        if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
          idf += termIDF;
          seenTerms.add(t.toString());
        }
      } catch (Exception e) {
        continue;
      }
    }

    double weightedScore = idf + 0.0001 * sent.getValue();
    ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
    if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
      if (scoredPassageHeap.size() == topPassages) {
        scoredPassageHeap.pollLast();
      }
      scoredPassageHeap.add(scoredPassage);
      seenSentences.add(sent.getKey());
    }
  }
}
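For reference, ClassicSimilarity's idf(docFreq, numDocs) is the classic Lucene TF-IDF inverse document frequency, so each distinct overlapping term contributes roughly the following amount to the passage score accumulated above:

// ClassicSimilarity#idf: rarer terms (smaller docFreq) get larger weights.
double termIDF = Math.log((double) numDocs / (docFreq + 1)) + 1.0;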
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
From the class Rm3Reranker, method rerank:
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  Preconditions.checkState(docs.documents.length == docs.scores.length);
  IndexSearcher searcher = context.getIndexSearcher();
  IndexReader reader = searcher.getIndexReader();

  FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
  FeatureVector rm = estimateRelevanceModel(docs, reader);
  LOG.info("Relevance model estimated.");
  rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

  StringBuilder builder = new StringBuilder();
  Iterator<String> terms = rm.iterator();
  while (terms.hasNext()) {
    String term = terms.next();
    double prob = rm.getFeatureWeight(term);
    builder.append(term + "^" + prob + " ");
  }
  String queryText = builder.toString().trim();

  QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
  Query nq = null;
  try {
    nq = p.parse(queryText);
  } catch (ParseException e) {
    e.printStackTrace();
    return docs;
  }
  LOG.info("Running new query: " + nq);

  TopDocs rs = null;
  try {
    if (context.getFilter() == null) {
      rs = searcher.search(nq, 1000);
    } else {
      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
      bqBuilder.add(nq, BooleanClause.Occur.MUST);
      Query q = bqBuilder.build();
      rs = searcher.search(q, 1000);
    }
  } catch (IOException e) {
    e.printStackTrace();
    return docs;
  }
  return ScoredDocuments.fromTopDocs(rs, searcher);
}
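The expansion query string built above has the form term1^w1 term2^w2 ..., and parsing it with a WhitespaceAnalyzer-backed QueryParser effectively yields one boosted term query per expansion term. A hypothetical sketch of constructing roughly the same query directly, without going through QueryParser (not the code Anserini uses, just an illustration of what the parse produces):

BooleanQuery.Builder feedbackQuery = new BooleanQuery.Builder();
Iterator<String> expansionTerms = rm.iterator();
while (expansionTerms.hasNext()) {
  String term = expansionTerms.next();
  float weight = (float) rm.getFeatureWeight(term);
  // One optional clause per expansion term, boosted by its interpolated RM3 weight.
  feedbackQuery.add(new BoostQuery(new TermQuery(new Term(field, term)), weight),
      BooleanClause.Occur.SHOULD);
}
Query expanded = feedbackQuery.build();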