Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
Class TrainingDataGenerator, method birthdate:
/**
 * Generates training data for the property birth date.
 * <p>
 * Note: this function might need some refactoring when we add more properties.
 */
void birthdate() throws ParseException, IOException {
  QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
  queryParser.setAllowLeadingWildcard(true);
  // A bare wildcard matches every document that has a birth date field
  Query q = queryParser.parse("*");
  LOG.info("Starting the search using query: {}", q.toString());

  // Collect all matching documents in a set of matching doc ids
  Set<Integer> matchingDocIds = new HashSet<>();
  getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
  LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());

  // Process the retrieved document ids
  matchingDocIds.forEach((Integer docId) -> {
    Document doc = null;
    try {
      doc = getKbIndexReader().document(docId);
    } catch (IOException e) {
      LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
      return;
    }

    String freebaseURI = doc.get(IndexNodes.FIELD_ID);
    // The field might hold multiple values
    String[] birthdates = doc.getValues(FIELD_BIRTHDATE);
    // Get the Freebase English label of this entity
    String[] labels = doc.getValues(FIELD_LABEL);
    String englishLabel = null;
    for (String label : labels) {
      Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
      if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
        englishLabel = literal.stringValue();
        break;
      }
    }

    // Some Freebase entities lack an English label; skip any entity that is
    // missing a label, a URI, or a birth date value
    if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0) {
      return;
    }

    String freebaseId = freebaseUriToFreebaseId(freebaseURI);
    for (String birthdate : birthdates) {
      // Extract the plain string value from the typed literal
      String birthdateVal = extractValueFromTypedLiteralString(birthdate);
      // Write the property value as training data
      writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
    }
    // TODO - After building an index for the mentions of Freebase entities in ClueWeb,
    // we need to get the ClueWeb mentions of this freebase entity and write them to a separate file
  });
}
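The detail that makes this method work is setAllowLeadingWildcard(true): the classic QueryParser rejects queries that begin with * or ? unless that flag is set, so parsing "*" against a default field is how the code matches every document carrying a birth date. A minimal standalone sketch of that behavior (the field name and analyzer are hypothetical stand-ins, not Anserini's):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class LeadingWildcardSketch {
  public static void main(String[] args) throws Exception {
    // Field name "birthdate" is a placeholder; any indexed string field works
    QueryParser parser = new QueryParser("birthdate", new StandardAnalyzer());
    // Without this flag, parse("*") throws ParseException
    parser.setAllowLeadingWildcard(true);
    Query q = parser.parse("*");
    System.out.println(q); // prints birthdate:*, a wildcard query over the field
  }
}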
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
Class PyseriniEntryPoint, method search:
/**
 * Runs the given topics against the index and returns a map from document
 * identifiers to retrieval scores.
 *
 * @param topics queries, keyed by topic ID
 * @param similarity scoring model to use
 * @param numHits number of hits to retrieve per query
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser if true, parse queries with QueryParser; otherwise build a bag-of-words query
 * @param keepstopwords if true, do not remove stopwords during analysis
 * @throws IOException if there is a low-level I/O error
 * @throws ParseException if a query cannot be parsed
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);
  // An empty stopword set keeps stopwords; the default EnglishAnalyzer removes them
  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    for (int i = 0; i < docs.documents.length; i++) {
      String docid = docs.documents[i].getField(FIELD_ID).stringValue();
      float score = docs.scores[i];
      scoredDocs.put(docid, score);
    }
  }
  return scoredDocs;
}
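When useQueryParser is false, the query is a plain disjunction of analyzed terms. A sketch of what AnalyzerUtils.buildBagOfWordsQuery plausibly amounts to; this helper body is an assumption built from standard Lucene APIs, and the real Anserini implementation may differ:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class BagOfWordsSketch {
  // Hypothetical equivalent of AnalyzerUtils.buildBagOfWordsQuery: run the
  // analyzer over the raw query string and OR together one TermQuery per token
  static Query buildBagOfWordsQuery(String field, Analyzer analyzer, String queryText) throws IOException {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    try (TokenStream stream = analyzer.tokenStream(field, queryText)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        builder.add(new TermQuery(new Term(field, term.toString())), BooleanClause.Occur.SHOULD);
      }
      stream.end();
    }
    return builder.build();
  }
}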
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
Class RetrieveSentences, method search:
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  // Use the BM25 scoring model with k1 = 0.9 and b = 0.4
  Similarity similarity = new BM25Similarity(0.9f, 0.4f);
  searcher.setSimilarity(similarity);
  EnglishAnalyzer ea = new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    // The query is built as a bag of words; queryParser above goes unused here
    Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
      scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
    }
  }
  return scoredDocs;
}
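The same retrieval pattern, reduced to a self-contained example against an on-disk index. The index path "lucene-index", field name "contents", and query string are placeholders, not values from Anserini:

import java.nio.file.Paths;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class Bm25SearchSketch {
  public static void main(String[] args) throws Exception {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); // k1 = 0.9, b = 0.4
      QueryParser parser = new QueryParser("contents", new EnglishAnalyzer());
      Query query = parser.parse("birthplace of albert einstein");
      TopDocs rs = searcher.search(query, 10);
      for (ScoreDoc hit : rs.scoreDocs) {
        System.out.println(hit.doc + " " + hit.score);
      }
    }
  }
}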
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
Class IdfPassageScorer, method score:
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
  EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
  QueryParser queryParser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzer);
  ClassicSimilarity similarity = new ClassicSimilarity();
  String escapedQuery = QueryParser.escape(query); // escape() is a static method
  Query question = queryParser.parse(escapedQuery);
  HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

  // A second analyzer/parser pair that keeps stopwords, used for IDF bookkeeping
  EnglishAnalyzer englishAnalyzerWithStop = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
  QueryParser queryParserWithStop = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzerWithStop);
  Query questionWithStopWords = queryParserWithStop.parse(escapedQuery);
  HashSet<String> questionTermsIDF = new HashSet<>(Arrays.asList(questionWithStopWords.toString().trim().toLowerCase().split("\\s+")));

  // Add the question terms to the term-IDF map
  for (String questionTerm : questionTermsIDF) {
    try {
      // Parsing a single token yields a TermQuery over the analyzed (stemmed) term;
      // the cast fails for anything else, and the catch below skips those terms
      TermQuery q = (TermQuery) queryParserWithStop.parse(questionTerm);
      Term t = q.getTerm();
      double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
      termIdfMap.put(questionTerm, String.valueOf(termIDF));
    } catch (Exception e) {
      continue;
    }
  }

  // Avoid scoring duplicate passages
  HashSet<String> seenSentences = new HashSet<>();
  for (Map.Entry<String, Float> sent : sentences.entrySet()) {
    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    String[] terms = sent.getKey().toLowerCase().split("\\s+");
    for (String term : terms) {
      try {
        TermQuery q = (TermQuery) queryParser.parse(term);
        Term t = q.getTerm();
        double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        // Sum the IDF of each distinct sentence term that also appears in the question
        if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
          idf += termIDF;
          seenTerms.add(t.toString());
        }
        TermQuery q2 = (TermQuery) queryParserWithStop.parse(term);
        Term t2 = q2.getTerm();
        double termIDFwithStop = similarity.idf(reader.docFreq(t2), reader.numDocs());
        termIdfMap.put(term, String.valueOf(termIDFwithStop));
      } catch (Exception e) {
        continue;
      }
    }
    // Mostly IDF overlap, with a small tie-breaking contribution from the retrieval score
    double weightedScore = idf + 0.0001 * sent.getValue();
    ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
    if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
        && !seenSentences.contains(sent.getKey())) {
      if (scoredPassageHeap.size() == topPassages) {
        scoredPassageHeap.pollLast();
      }
      scoredPassageHeap.add(scoredPassage);
      seenSentences.add(sent.getKey());
    }
  }
}
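A subtlety worth noting: round-tripping each raw token through QueryParser.parse applies the analyzer, so docFreq is looked up with the stemmed term rather than the surface form. A small sketch of that effect (the field name is a hypothetical placeholder):

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.TermQuery;

public class AnalyzedTermSketch {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("contents", new EnglishAnalyzer());
    // Parsing a single word applies the analyzer, so the resulting term is stemmed
    TermQuery q = (TermQuery) parser.parse("running");
    Term t = q.getTerm();
    System.out.println(t); // prints contents:run (EnglishAnalyzer stems "running" to "run")
  }
}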
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
Class SearchWebCollection, method search:
/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topics queries, keyed by topic ID
 * @param submissionFile path of the run file to write
 * @param similarity scoring model to use
 * @param numHits number of hits to retrieve per query
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser if true, parse queries with QueryParser; otherwise build a bag-of-words query
 * @param keepstopwords if true, do not remove stopwords during analysis
 * @throws IOException if there is a low-level I/O error
 * @throws ParseException if a query cannot be parsed
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);
  final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
  PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    // For Web Tracks 2010, 2011, and 2012, an experimental run consists of the
    // top 10,000 documents for each topic query
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    // TREC run format, six columns per line:
    //   1. the topic number
    //   2. currently unused; should always be "Q0"
    //   3. the official document identifier of the retrieved document
    //   4. the rank at which the document is retrieved
    //   5. the score (integer or floating point) that generated the ranking
    //   6. the "run tag", a unique identifier for your run
    for (int i = 0; i < docs.documents.length; i++) {
      out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
    }
  }
  out.flush();
  out.close();
}
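For reference, a line of the resulting run file would look like the following, assuming FIELD_BODY is "body" and the BM25 parameters shown earlier; the topic number and document identifier are invented for illustration:

51 Q0 clueweb09-en0000-29-12345 1 12.456800 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)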