Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class Rm3Reranker, method rerank.
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  Preconditions.checkState(docs.documents.length == docs.scores.length);

  IndexSearcher searcher = context.getIndexSearcher();
  IndexReader reader = searcher.getIndexReader();

  // Represent the original query as an L1-normalized feature vector over its analyzed terms.
  FeatureVector qfv = FeatureVector.fromTerms(
      AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();

  // Estimate the relevance model from the initial retrieval results.
  FeatureVector rm = estimateRelevanceModel(docs, reader);
  LOG.info("Relevance model estimated.");

  // Interpolate the relevance model with the original query.
  rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

  // Serialize the expanded query as whitespace-separated "term^weight" pairs.
  StringBuilder builder = new StringBuilder();
  Iterator<String> terms = rm.iterator();
  while (terms.hasNext()) {
    String term = terms.next();
    double prob = rm.getFeatureWeight(term);
    builder.append(term).append("^").append(prob).append(" ");
  }
  String queryText = builder.toString().trim();

  QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
  Query nq;
  try {
    nq = p.parse(queryText);
  } catch (ParseException e) {
    // Fall back to the original ranking if the expanded query fails to parse.
    e.printStackTrace();
    return docs;
  }

  LOG.info("Running new query: " + nq);

  TopDocs rs;
  try {
    if (context.getFilter() == null) {
      rs = searcher.search(nq, 1000);
    } else {
      // Reapply the context filter around the expanded query.
      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
      bqBuilder.add(nq, BooleanClause.Occur.MUST);
      Query q = bqBuilder.build();
      rs = searcher.search(q, 1000);
    }
  } catch (IOException e) {
    e.printStackTrace();
    return docs;
  }

  return ScoredDocuments.fromTopDocs(rs, searcher);
}
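The expanded query above is just whitespace-separated term^weight pairs handed to the classic QueryParser with a WhitespaceAnalyzer, so each weight becomes a Lucene boost on the corresponding term. A minimal standalone sketch of that parsing step, where "contents" is a hypothetical field name standing in for the reranker's `field`:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class WeightedQueryDemo {
  public static void main(String[] args) throws Exception {
    // "contents" is a placeholder; Anserini's actual field comes from the reranker's `field`.
    QueryParser p = new QueryParser("contents", new WhitespaceAnalyzer());
    // Each term carries its interpolated RM3 weight as a classic-syntax boost.
    Query q = p.parse("car^0.41 engine^0.27 hybrid^0.12");
    // Prints the boosted disjunction, e.g.: (contents:car)^0.41 (contents:engine)^0.27 ...
    System.out.println(q);
  }
}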
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class EntityLinking, method search.
/**
 * Returns a list of query results.
 *
 * @param queryName the entity name to search
 * @param numHits the maximum number of hits to return
 * @return a list of top-ranked entities
 * @throws Exception on error
 */
public List<RankedEntity> search(String queryName, int numHits) throws Exception {
  List<RankedEntity> rankedEntities = new ArrayList<>();

  // Initialize the index searcher.
  IndexSearcher searcher = new IndexSearcher(reader);

  // First pass: exact (sloppy) phrase search on the query name.
  QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
  queryParser.setAutoGeneratePhraseQueries(true);
  queryParser.setPhraseSlop(3);
  queryName = "\"" + queryName + "\"";
  Query query = queryParser.parse(queryName);
  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);

  for (int i = 0; i < docs.documents.length; i++) {
    float score = docs.scores[i];
    String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
    String shortMid = getShortMid(mid);
    String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
    String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
    rankedEntities.add(new RankedEntity(shortMid, score, name, label));
  }

  if (docs.documents.length >= numHits) {
    return rankedEntities;
  }

  // Second pass: fill the remaining slots with a TF-IDF search over both name and label fields.
  int numHitsLeft = numHits - docs.documents.length;
  Similarity similarity = new ClassicSimilarity();
  searcher.setSimilarity(similarity);
  queryParser = new MultiFieldQueryParser(
      new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL }, new SimpleAnalyzer());
  queryParser.setDefaultOperator(QueryParser.Operator.AND);
  query = queryParser.parse(queryName);
  rs = searcher.search(query, numHitsLeft);
  docs = ScoredDocuments.fromTopDocs(rs, searcher);

  for (int i = 0; i < docs.documents.length; i++) {
    float score = docs.scores[i];
    String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
    String shortMid = getShortMid(mid);
    String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
    String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
    rankedEntities.add(new RankedEntity(shortMid, score, name, label));
  }

  return rankedEntities;
}
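The first pass leans on the parser's quoting behavior: wrapping the name in quotes, together with setAutoGeneratePhraseQueries and a slop of 3, yields a single sloppy PhraseQuery rather than independent term matches. A minimal sketch of that configuration in isolation, where "name" is a stand-in for IndexTopics.FIELD_NAME:

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class PhraseSearchDemo {
  public static void main(String[] args) throws Exception {
    // "name" is a stand-in for IndexTopics.FIELD_NAME.
    QueryParser parser = new QueryParser("name", new SimpleAnalyzer());
    parser.setAutoGeneratePhraseQueries(true);
    parser.setPhraseSlop(3); // tolerate up to 3 positions of movement within the phrase
    // Quoting the name makes the parser emit a single sloppy PhraseQuery.
    Query q = parser.parse("\"barack obama\"");
    System.out.println(q); // name:"barack obama"~3
  }
}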
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class TrainingDataGenerator, method birthdate.
/**
 * Generates training data for the birth date property.
 * <p>
 * Note: this function might need some refactoring when we add more properties.
 */
void birthdate() throws ParseException, IOException {
  QueryParser queryParser = new QueryParser(FIELD_BIRTHDATE, getKbIndexAnalyzer());
  queryParser.setAllowLeadingWildcard(true);
  Query q = queryParser.parse("*");
  LOG.info("Starting the search using query: {}", q.toString());

  // Collect the ids of all matching documents.
  Set<Integer> matchingDocIds = new HashSet<>();
  getKbIndexSearcher().search(q, new CheckHits.SetCollector(matchingDocIds));
  LOG.info("Found {} matching documents, retrieving...", matchingDocIds.size());

  // Process the retrieved document ids.
  matchingDocIds.forEach((Integer docId) -> {
    Document doc;
    try {
      doc = getKbIndexReader().document(docId);
    } catch (IOException e) {
      LOG.warn("Error retrieving document with id: {}. Ignoring.", docId);
      return;
    }

    String freebaseURI = doc.get(IndexNodes.FIELD_ID);

    // The field may have multiple values.
    String[] birthdates = doc.getValues(FIELD_BIRTHDATE);

    // Get the Freebase English label of this entity.
    String[] labels = doc.getValues(FIELD_LABEL);
    String englishLabel = null;
    for (String label : labels) {
      Literal literal = NTriplesUtil.parseLiteral(label, valueFactory);
      if (literal.getLanguage().orElse("N/A").toLowerCase().equals("en")) {
        englishLabel = literal.stringValue();
        break;
      }
    }

    // Skip entities that lack an English label, a URI, or any birth date value
    // (some Freebase entities are missing these).
    if (englishLabel == null || freebaseURI == null || birthdates == null || birthdates.length == 0) {
      return;
    }

    String freebaseId = freebaseUriToFreebaseId(freebaseURI);
    for (String birthdate : birthdates) {
      // Extract the raw value from the typed literal and write it as a training example.
      String birthdateVal = extractValueFromTypedLiteralString(birthdate);
      writeToTrainingFile(TRAINING_DATA_OUTPUT_FILE_EXAMPLES, freebaseId, englishLabel, birthdateVal);
    }
    // TODO: after building an index of Freebase entity mentions in ClueWeb, retrieve the
    // ClueWeb mentions of this entity and write them to a separate file.
  });
}
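The classic parser rejects a bare "*" by default because it starts with a wildcard; with setAllowLeadingWildcard(true) it becomes a wildcard query that matches every document carrying a term in the field, which is how this method enumerates all entities with a birth date. A minimal standalone sketch, where "birthdate" and StandardAnalyzer are stand-ins for FIELD_BIRTHDATE and the KB index analyzer:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;

public class MatchAllFieldDemo {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser("birthdate", new StandardAnalyzer());
    // A bare "*" starts with a wildcard, which the parser rejects unless this is enabled.
    parser.setAllowLeadingWildcard(true);
    Query q = parser.parse("*");
    System.out.println(q); // birthdate:* — matches every document with a term in the field
  }
}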
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class PyseriniEntryPoint, method search.
/**
 * Searches the given topics and collects the scored documents.
 *
 * @param topics queries keyed by topic id
 * @param similarity scoring model
 * @param numHits maximum number of hits per query
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser if true, parse queries with the classic QueryParser; otherwise build bag-of-words queries
 * @param keepstopwords if true, keep stopwords during analysis
 * @return a map from document id to retrieval score
 * @throws IOException on I/O error
 * @throws ParseException on query parsing error
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, Similarity similarity, int numHits,
    RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  // An empty stopword set keeps stopwords; the default constructor uses the standard English list.
  EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = useQueryParser
        ? queryParser.parse(queryString)
        : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);

    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
    RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString,
        queryTokens, FIELD_BODY, null);

    // Run the reranker cascade over the initial retrieval results.
    ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    for (int i = 0; i < docs.documents.length; i++) {
      String docid = docs.documents[i].getField(FIELD_ID).stringValue();
      float score = docs.scores[i];
      scoredDocs.put(docid, score);
    }
  }
  return scoredDocs;
}
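The keepstopwords flag works by swapping the analyzer's stopword set: an empty CharArraySet disables stopword removal entirely, while the default constructor applies Lucene's standard English list (stemming happens either way). A small runnable sketch of the difference:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StopwordToggleDemo {
  // Collects the tokens an analyzer produces for the given text.
  static List<String> tokens(Analyzer analyzer, String text) throws IOException {
    List<String> out = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("body", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        out.add(term.toString());
      }
      ts.end();
    }
    return out;
  }

  public static void main(String[] args) throws IOException {
    String q = "the mayor of london";
    // Default English stopword list drops "the" and "of".
    System.out.println(tokens(new EnglishAnalyzer(), q)); // [mayor, london]
    // Empty stopword set keeps every (stemmed) token.
    System.out.println(tokens(new EnglishAnalyzer(CharArraySet.EMPTY_SET), q)); // [the, mayor, of, london]
  }
}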
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini.
The class RetrieveSentences, method search.
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
  IndexSearcher searcher = new IndexSearcher(reader);

  // Use the BM25 scoring model with k1 = 0.9 and b = 0.4.
  Similarity similarity = new BM25Similarity(0.9f, 0.4f);
  searcher.setSimilarity(similarity);

  EnglishAnalyzer ea = new EnglishAnalyzer();
  // Note: this parser is configured but unused below; the query is built directly
  // from the analyzed tokens instead.
  QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);

  Map<String, Float> scoredDocs = new LinkedHashMap<>();
  for (Map.Entry<Integer, String> entry : topics.entrySet()) {
    int qID = entry.getKey();
    String queryString = entry.getValue();
    Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
    TopDocs rs = searcher.search(query, numHits);
    ScoreDoc[] hits = rs.scoreDocs;
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
      scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
    }
  }
  return scoredDocs;
}
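Here the query never goes through the QueryParser at all; AnalyzerUtils.buildBagOfWordsQuery constructs it directly from the analyzed tokens. A hedged sketch of what such a construction can look like with plain Lucene primitives (an illustration under that assumption, not Anserini's actual AnalyzerUtils code):

import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class BagOfWordsSketch {
  // Builds a disjunction of TermQuery clauses, one per analyzed token
  // (e.g. the output of AnalyzerUtils.tokenize(analyzer, queryString)).
  static Query bagOfWords(String field, List<String> tokens) {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    for (String token : tokens) {
      builder.add(new TermQuery(new Term(field, token)), BooleanClause.Occur.SHOULD);
    }
    return builder.build();
  }
}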