Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini: the IdfPassageScorer class, score method.
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
QueryParser queryParser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzer);
ClassicSimilarity similarity = new ClassicSimilarity();
String escapedQuery = QueryParser.escape(query);
Query question = queryParser.parse(escapedQuery);
HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));
EnglishAnalyzer englishAnalyzerWithStop = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
QueryParser queryParserWithStop = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, englishAnalyzerWithStop);
Query questionWithStopWords = queryParserWithStop.parse(escapedQuery);
HashSet<String> questionTermsIDF = new HashSet<>(Arrays.asList(questionWithStopWords.toString().trim().toLowerCase().split("\\s+")));
// add the question terms to the termIDF Map
for (String questionTerm : questionTermsIDF) {
try {
TermQuery q = (TermQuery) queryParserWithStop.parse(questionTerm);
Term t = q.getTerm();
double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
termIdfMap.put(questionTerm, String.valueOf(termIDF));
} catch (Exception e) {
continue;
}
}
// avoid duplicate passages
HashSet<String> seenSentences = new HashSet<>();
for (Map.Entry<String, Float> sent : sentences.entrySet()) {
double idf = 0.0;
HashSet<String> seenTerms = new HashSet<>();
String[] terms = sent.getKey().toLowerCase().split("\\s+");
for (String term : terms) {
try {
TermQuery q = (TermQuery) queryParser.parse(term);
Term t = q.getTerm();
double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
idf += termIDF;
seenTerms.add(t.toString());
}
TermQuery q2 = (TermQuery) queryParserWithStop.parse(term);
Term t2 = q2.getTerm();
double termIDFwithStop = similarity.idf(reader.docFreq(t2), reader.numDocs());
termIdfMap.put(term, String.valueOf(termIDFwithStop));
} catch (Exception e) {
continue;
}
}
double weightedScore = idf + 0.0001 * sent.getValue();
ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent.getKey())) {
if (scoredPassageHeap.size() == topPassages) {
scoredPassageHeap.pollLast();
}
scoredPassageHeap.add(scoredPassage);
seenSentences.add(sent.getKey());
}
}
}
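The pattern above — escaping free text with QueryParser, parsing each token back into a TermQuery, and turning collection statistics into IDF weights via ClassicSimilarity — can be distilled into a standalone sketch. This is a minimal sketch rather than Anserini code: the index path "lucene-index", the field name "contents", and the sample question are assumptions, and it targets a Lucene version (5.4+) where ClassicSimilarity and the Path-based FSDirectory.open are available.
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.FSDirectory;

public class TermIdfSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical index location and field name; adjust to the index at hand.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
      QueryParser parser = new QueryParser("contents", new EnglishAnalyzer());
      ClassicSimilarity similarity = new ClassicSimilarity();
      Map<String, Float> termIdf = new HashMap<>();
      String question = "who invented the telephone?";
      // Escape query-syntax characters so arbitrary free text parses cleanly.
      for (String token : QueryParser.escape(question).toLowerCase().split("\\s+")) {
        try {
          // A single analyzed token normally parses to a TermQuery over the default field.
          TermQuery tq = (TermQuery) parser.parse(token);
          Term t = tq.getTerm();
          termIdf.put(token, similarity.idf(reader.docFreq(t), reader.numDocs()));
        } catch (Exception e) {
          // Stopwords analyze away and some tokens do not yield a TermQuery; skip those.
        }
      }
      termIdf.forEach((term, idf) -> System.out.println(term + " -> " + idf));
    }
  }
}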
Use of org.apache.lucene.queryparser.classic.QueryParser in project Anserini by castorini: the SearchWebCollection class, search method.
/**
 * Writes a TREC submission file for the given topics.
 *
 * @param topics queries, keyed by topic id
 * @param submissionFile path of the run file to write
 * @param similarity scoring function (e.g. BM25)
 * @param numHits number of hits to retrieve per topic
 * @param cascade reranker cascade applied to the initial results
 * @param useQueryParser whether to parse the topic with QueryParser instead of building a bag-of-words query
 * @param keepstopwords whether to keep stopwords during analysis
 * @throws IOException if the index cannot be read or the run file cannot be written
 * @throws ParseException if a topic cannot be parsed
 */
public void search(SortedMap<Integer, String> topics, String submissionFile, Similarity similarity, int numHits, RerankerCascade cascade, boolean useQueryParser, boolean keepstopwords) throws IOException, ParseException {
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(similarity);
final String runTag = "BM25_EnglishAnalyzer_" + (keepstopwords ? "KeepStopwords_" : "") + FIELD_BODY + "_" + similarity.toString();
PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(submissionFile), StandardCharsets.US_ASCII));
EnglishAnalyzer ea = keepstopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
QueryParser queryParser = new QueryParser(FIELD_BODY, ea);
queryParser.setDefaultOperator(QueryParser.Operator.OR);
for (Map.Entry<Integer, String> entry : topics.entrySet()) {
int qID = entry.getKey();
String queryString = entry.getValue();
Query query = useQueryParser ? queryParser.parse(queryString) : AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
// For Web Tracks 2010, 2011, and 2012, an experimental run consists of the top 10,000 documents for each topic query.
TopDocs rs = searcher.search(query, numHits);
ScoreDoc[] hits = rs.scoreDocs;
List<String> queryTokens = AnalyzerUtils.tokenize(ea, queryString);
RerankerContext context = new RerankerContext(searcher, query, String.valueOf(qID), queryString, queryTokens, FIELD_BODY, null);
ScoredDocuments docs = cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
/*
 * TREC run file format, one line per retrieved document:
 * the first column is the topic number,
 * the second column is currently unused and should always be "Q0",
 * the third column is the official document identifier of the retrieved document,
 * the fourth column is the rank at which the document is retrieved,
 * the fifth column shows the score (integer or floating point) that produced the ranking,
 * and the sixth column is the "run tag", a unique identifier for the run.
 */
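// A produced line therefore looks like the following (all values hypothetical):
// 51 Q0 clueweb09-en0000-23-12345 1 12.345600 BM25_EnglishAnalyzer_body_BM25(k1=0.9,b=0.4)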
for (int i = 0; i < docs.documents.length; i++) {
out.println(String.format("%d Q0 %s %d %f %s", qID, docs.documents[i].getField(FIELD_ID).stringValue(), (i + 1), docs.scores[i], runTag));
}
}
out.flush();
out.close();
}
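The retrieval loop above reduces to a short standalone pattern: parse each topic with a QueryParser backed by EnglishAnalyzer, score under BM25, and emit one six-column TREC line per hit. A minimal sketch under stated assumptions, not the Anserini class: the index path, field names ("body", "id"), run tag, sample topic, and hit count are all hypothetical.
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class TrecRunSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical index path, output file, field names, and topic.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")));
         PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get("run.txt"), StandardCharsets.US_ASCII))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      searcher.setSimilarity(new BM25Similarity());
      QueryParser parser = new QueryParser("body", new EnglishAnalyzer());
      parser.setDefaultOperator(QueryParser.Operator.OR);

      Map<Integer, String> topics = new LinkedHashMap<>();
      topics.put(51, "horse hooves");

      for (Map.Entry<Integer, String> topic : topics.entrySet()) {
        Query query = parser.parse(QueryParser.escape(topic.getValue()));
        ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
          String docid = searcher.doc(hits[i].doc).get("id");
          // topic Q0 docid rank score runTag
          out.println(String.format("%d Q0 %s %d %f %s", topic.getKey(), docid, i + 1, hits[i].score, "bm25-sketch"));
        }
      }
    }
  }
}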
Use of org.apache.lucene.queryparser.classic.QueryParser in project jspwiki by apache: the LuceneSearchProvider class, findPages method.
/**
 * Searches pages using a particular combination of flags.
 *
 * @param query The query to perform in Lucene query language
 * @param flags A set of flags
 * @param wikiContext The wiki context used for permission checks on the results
 * @return A Collection of SearchResult instances
 * @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags, WikiContext wikiContext) throws ProviderException {
IndexSearcher searcher = null;
ArrayList<SearchResult> list = null;
Highlighter highlighter = null;
try {
String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
QueryParser qp = new MultiFieldQueryParser(Version.LUCENE_47, queryfields, getLuceneAnalyzer());
// QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
Query luceneQuery = qp.parse(query);
if ((flags & FLAG_CONTEXTS) != 0) {
highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"), new SimpleHTMLEncoder(), new QueryScorer(luceneQuery));
}
try {
File dir = new File(m_luceneDirectory);
Directory luceneDir = new SimpleFSDirectory(dir, null);
IndexReader reader = DirectoryReader.open(luceneDir);
searcher = new IndexSearcher(reader);
} catch (Exception ex) {
log.info("Lucene not yet ready; indexing not started", ex);
return null;
}
ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;
AuthorizationManager mgr = m_engine.getAuthorizationManager();
list = new ArrayList<SearchResult>(hits.length);
for (int curr = 0; curr < hits.length; curr++) {
int docID = hits[curr].doc;
Document doc = searcher.doc(docID);
String pageName = doc.get(LUCENE_ID);
WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);
if (page != null) {
if (page instanceof Attachment) {
// Currently attachments don't look nice on the search-results page
// When the search-results are cleaned up this can be enabled again.
}
PagePermission pp = new PagePermission(page, PagePermission.VIEW_ACTION);
if (mgr.checkPermission(wikiContext.getWikiSession(), pp)) {
int score = (int) (hits[curr].score * 100);
// Get highlighted search contexts
String text = doc.get(LUCENE_PAGE_CONTENTS);
String[] fragments = new String[0];
if (text != null && highlighter != null) {
TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
}
SearchResult result = new SearchResultImpl(page, score, fragments);
list.add(result);
}
} else {
log.error("Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache");
pageRemoved(new WikiPage(m_engine, pageName));
}
}
} catch (IOException e) {
log.error("Failed during lucene search", e);
} catch (ParseException e) {
log.info("Broken query; cannot parse query ", e);
throw new ProviderException("You have entered a query Lucene cannot process: " + e.getMessage());
} catch (InvalidTokenOffsetsException e) {
log.error("Tokens are incompatible with provided text ", e);
} finally {
if (searcher != null) {
try {
searcher.getIndexReader().close();
} catch (IOException e) {
log.error(e);
}
}
}
return list;
}
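The JSPWiki method combines three reusable pieces: a MultiFieldQueryParser spanning several fields, a Highlighter built from the parsed query, and fragment extraction over the stored page text. A minimal sketch, not JSPWiki code: it assumes a newer Lucene where the Version argument has been dropped, and the index path, field names, and query string are hypothetical.
import java.io.StringReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;

public class HighlightedSearchSketch {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    // Hypothetical index path and field names; JSPWiki wires these from its LUCENE_* constants.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      String[] fields = { "contents", "name", "author" };
      MultiFieldQueryParser qp = new MultiFieldQueryParser(fields, analyzer);
      Query query = qp.parse("wiki AND markup");
      Highlighter highlighter = new Highlighter(
          new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
          new SimpleHTMLEncoder(),
          new QueryScorer(query));
      for (ScoreDoc hit : searcher.search(query, 10).scoreDocs) {
        String text = searcher.doc(hit.doc).get("contents");
        if (text != null) {
          TokenStream ts = analyzer.tokenStream("contents", new StringReader(text));
          // Up to 3 highlighted fragments per hit.
          String[] fragments = highlighter.getBestFragments(ts, text, 3);
          System.out.println(String.join(" ... ", fragments));
        }
      }
    }
  }
}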
Use of org.apache.lucene.queryparser.classic.QueryParser in project derby by apache: the LuceneQueryVTI class, initScan method.
// ///////////////////////////////////////////////////////////////////
//
// MINIONS
//
// ///////////////////////////////////////////////////////////////////
/**
* Initialize the metadata and scan
*/
private void initScan() throws SQLException {
try {
// read the execution context for this AwareVTI
VTIContext context = getContext();
_schema = context.vtiSchema();
String[] nameParts = LuceneSupport.decodeFunctionName(context.vtiTable());
_table = nameParts[LuceneSupport.TABLE_PART];
_column = nameParts[LuceneSupport.COLUMN_PART];
// divine the column names
VTITemplate.ColumnDescriptor[] returnColumns = getReturnTableSignature(_connection);
String[] columnNames = new String[returnColumns.length];
for (int i = 0; i < returnColumns.length; i++) {
columnNames[i] = returnColumns[i].columnName;
}
setColumnNames(columnNames);
_scoreColumnID = getColumnCount();
_docIDColumnID = _scoreColumnID - 1;
_maxKeyID = _docIDColumnID - 1;
_minKeyID = 1;
// make sure the user has SELECT privilege on all relevant columns of the underlying table
vetPrivileges();
String delimitedColumnName = LuceneSupport.delimitID(_column);
DerbyLuceneDir derbyLuceneDir = LuceneSupport.getDerbyLuceneDir(_connection, _schema, _table, delimitedColumnName);
StorageFile propertiesFile = LuceneSupport.getIndexPropertiesFile(derbyLuceneDir);
Properties indexProperties = readIndexProperties(propertiesFile);
String indexDescriptorMaker = indexProperties.getProperty(LuceneSupport.INDEX_DESCRIPTOR_MAKER);
LuceneIndexDescriptor indexDescriptor = getIndexDescriptor(indexDescriptorMaker);
Analyzer analyzer = indexDescriptor.getAnalyzer();
QueryParser qp = indexDescriptor.getQueryParser();
vetLuceneVersion(indexProperties.getProperty(LuceneSupport.LUCENE_VERSION));
_indexReader = getIndexReader(derbyLuceneDir);
_searcher = new IndexSearcher(_indexReader);
Query luceneQuery = qp.parse(_queryText);
TopScoreDocCollector tsdc = TopScoreDocCollector.create(_windowSize, true);
if (_scoreCeiling != null) {
tsdc = TopScoreDocCollector.create(_windowSize, new ScoreDoc(0, _scoreCeiling), true);
}
searchAndScore(luceneQuery, tsdc);
} catch (IOException ioe) {
throw ToolUtilities.wrap(ioe);
} catch (ParseException pe) {
throw ToolUtilities.wrap(pe);
} catch (PrivilegedActionException pae) {
throw ToolUtilities.wrap(pae);
}
}
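The QueryParser-adjacent detail worth noting here is the collector setup: when a score ceiling is supplied, collection starts "after" a synthetic ScoreDoc carrying that score, so only lower-scoring hits are returned. Below is a minimal sketch of that idea under assumed names (index path, field, query text) and Lucene 5.x–7.x collector signatures; TopScoreDocCollector.create differs in 4.x and 8+.
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;

public class CollectorScanSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical index path, field, query text, window size, and ceiling.
    int windowSize = 100;
    Float scoreCeiling = null; // e.g. 5.0f to resume a scan strictly below a previous score
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      QueryParser qp = new QueryParser("textcol", new StandardAnalyzer());
      Query query = qp.parse("apache AND derby");
      // With a ceiling, collection starts "after" a synthetic ScoreDoc carrying that score.
      TopScoreDocCollector collector = (scoreCeiling == null)
          ? TopScoreDocCollector.create(windowSize)
          : TopScoreDocCollector.create(windowSize, new ScoreDoc(0, scoreCeiling));
      searcher.search(query, collector);
      TopDocs topDocs = collector.topDocs();
      for (ScoreDoc hit : topDocs.scoreDocs) {
        System.out.println(hit.doc + " scored " + hit.score);
      }
    }
  }
}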
Use of org.apache.lucene.queryparser.classic.QueryParser in project jackrabbit-oak by apache: the LuceneIndex class, query method.
@Override
public Cursor query(final IndexPlan plan, NodeState rootState) {
final Filter filter = plan.getFilter();
FullTextExpression ft = filter.getFullTextConstraint();
final Set<String> relPaths = getRelativePaths(ft);
if (relPaths.size() > 1) {
return new MultiLuceneIndex(filter, rootState, relPaths).query();
}
final String parent = relPaths.size() == 0 ? "" : relPaths.iterator().next();
// we only restrict non-full-text conditions if there is
// no relative property in the full-text constraint
final boolean nonFullTextConstraints = parent.isEmpty();
final int parentDepth = getDepth(parent);
QueryLimits settings = filter.getQueryLimits();
Iterator<LuceneResultRow> itr = new AbstractIterator<LuceneResultRow>() {
private final Deque<LuceneResultRow> queue = Queues.newArrayDeque();
private final Set<String> seenPaths = Sets.newHashSet();
private ScoreDoc lastDoc;
private int nextBatchSize = LUCENE_QUERY_BATCH_SIZE;
private boolean noDocs = false;
private long lastSearchIndexerVersion;
private int reloadCount;
@Override
protected LuceneResultRow computeNext() {
while (!queue.isEmpty() || loadDocs()) {
return queue.remove();
}
return endOfData();
}
private LuceneResultRow convertToRow(ScoreDoc doc, IndexSearcher searcher, String excerpt) throws IOException {
IndexReader reader = searcher.getIndexReader();
PathStoredFieldVisitor visitor = new PathStoredFieldVisitor();
reader.document(doc.doc, visitor);
String path = visitor.getPath();
if (path != null) {
if ("".equals(path)) {
path = "/";
}
if (!parent.isEmpty()) {
// TODO OAK-828 this breaks node aggregation
// get the base path
// ensure the path ends with the given
// relative path
// if (!path.endsWith("/" + parent)) {
// continue;
// }
path = getAncestorPath(path, parentDepth);
// avoid duplicate entries
if (seenPaths.contains(path)) {
return null;
}
seenPaths.add(path);
}
return new LuceneResultRow(path, doc.score, excerpt);
}
return null;
}
/**
* Loads the lucene documents in batches
* @return true if any document is loaded
*/
private boolean loadDocs() {
if (noDocs) {
return false;
}
ScoreDoc lastDocToRecord = null;
IndexNode indexNode = tracker.acquireIndexNode((String) plan.getAttribute(ATTR_INDEX_PATH));
checkState(indexNode != null);
try {
IndexSearcher searcher = indexNode.getSearcher();
LuceneRequestFacade luceneRequestFacade = getLuceneRequest(filter, searcher.getIndexReader(), nonFullTextConstraints, indexNode.getDefinition());
if (luceneRequestFacade.getLuceneRequest() instanceof Query) {
Query query = (Query) luceneRequestFacade.getLuceneRequest();
TopDocs docs;
long time = System.currentTimeMillis();
checkForIndexVersionChange(searcher);
while (true) {
if (lastDoc != null) {
LOG.debug("loading the next {} entries for query {}", nextBatchSize, query);
docs = searcher.searchAfter(lastDoc, query, nextBatchSize);
} else {
LOG.debug("loading the first {} entries for query {}", nextBatchSize, query);
docs = searcher.search(query, nextBatchSize);
}
time = System.currentTimeMillis() - time;
LOG.debug("... took {} ms", time);
nextBatchSize = (int) Math.min(nextBatchSize * 2L, 100000);
PropertyRestriction restriction = filter.getPropertyRestriction(QueryConstants.REP_EXCERPT);
boolean addExcerpt = restriction != null && restriction.isNotNullRestriction();
Analyzer analyzer = indexNode.getDefinition().getAnalyzer();
if (addExcerpt) {
// setup highlighter
QueryScorer scorer = new QueryScorer(query);
scorer.setExpandMultiTermQuery(true);
highlighter.setFragmentScorer(scorer);
}
for (ScoreDoc doc : docs.scoreDocs) {
String excerpt = null;
if (addExcerpt) {
excerpt = getExcerpt(analyzer, searcher, doc);
}
LuceneResultRow row = convertToRow(doc, searcher, excerpt);
if (row != null) {
queue.add(row);
}
lastDocToRecord = doc;
}
if (queue.isEmpty() && docs.scoreDocs.length > 0) {
lastDoc = lastDocToRecord;
} else {
break;
}
}
} else if (luceneRequestFacade.getLuceneRequest() instanceof SpellcheckHelper.SpellcheckQuery) {
SpellcheckHelper.SpellcheckQuery spellcheckQuery = (SpellcheckHelper.SpellcheckQuery) luceneRequestFacade.getLuceneRequest();
noDocs = true;
SuggestWord[] suggestWords = SpellcheckHelper.getSpellcheck(spellcheckQuery);
// ACL filter spellchecks
Collection<String> suggestedWords = new ArrayList<String>(suggestWords.length);
QueryParser qp = new QueryParser(Version.LUCENE_47, FieldNames.SUGGEST, indexNode.getDefinition().getAnalyzer());
for (SuggestWord suggestion : suggestWords) {
Query query = qp.createPhraseQuery(FieldNames.SUGGEST, suggestion.string);
TopDocs topDocs = searcher.search(query, 100);
if (topDocs.totalHits > 0) {
for (ScoreDoc doc : topDocs.scoreDocs) {
Document retrievedDoc = searcher.doc(doc.doc);
if (filter.isAccessible(retrievedDoc.get(FieldNames.PATH))) {
suggestedWords.add(suggestion.string);
break;
}
}
}
}
queue.add(new LuceneResultRow(suggestedWords));
} else if (luceneRequestFacade.getLuceneRequest() instanceof SuggestHelper.SuggestQuery) {
SuggestHelper.SuggestQuery suggestQuery = (SuggestHelper.SuggestQuery) luceneRequestFacade.getLuceneRequest();
noDocs = true;
List<Lookup.LookupResult> lookupResults = SuggestHelper.getSuggestions(indexNode.getLookup(), suggestQuery);
// ACL filter suggestions
Collection<String> suggestedWords = new ArrayList<String>(lookupResults.size());
QueryParser qp = new QueryParser(Version.LUCENE_47, FieldNames.FULLTEXT, indexNode.getDefinition().getAnalyzer());
for (Lookup.LookupResult suggestion : lookupResults) {
Query query = qp.createPhraseQuery(FieldNames.FULLTEXT, suggestion.key.toString());
TopDocs topDocs = searcher.search(query, 100);
if (topDocs.totalHits > 0) {
for (ScoreDoc doc : topDocs.scoreDocs) {
Document retrievedDoc = searcher.doc(doc.doc);
if (filter.isAccessible(retrievedDoc.get(FieldNames.PATH))) {
suggestedWords.add("{term=" + suggestion.key + ",weight=" + suggestion.value + "}");
break;
}
}
}
}
queue.add(new LuceneResultRow(suggestedWords));
}
} catch (IOException e) {
LOG.warn("query via {} failed.", LuceneIndex.this, e);
} finally {
indexNode.release();
}
if (lastDocToRecord != null) {
this.lastDoc = lastDocToRecord;
}
return !queue.isEmpty();
}
private void checkForIndexVersionChange(IndexSearcher searcher) {
long currentVersion = LucenePropertyIndex.getVersion(searcher);
if (currentVersion != lastSearchIndexerVersion && lastDoc != null) {
reloadCount++;
if (reloadCount > MAX_RELOAD_COUNT) {
LOG.error("More than {} index version changes detected for query {}", MAX_RELOAD_COUNT, plan);
throw new IllegalStateException("Too many version changes");
}
lastDoc = null;
LOG.debug("Change in index version detected {} => {}. Query would be performed without " + "offset; reload {}", currentVersion, lastSearchIndexerVersion, reloadCount);
}
this.lastSearchIndexerVersion = currentVersion;
}
};
SizeEstimator sizeEstimator = new SizeEstimator() {
@Override
public long getSize() {
IndexNode indexNode = tracker.acquireIndexNode((String) plan.getAttribute(ATTR_INDEX_PATH));
checkState(indexNode != null);
try {
IndexSearcher searcher = indexNode.getSearcher();
LuceneRequestFacade luceneRequestFacade = getLuceneRequest(filter, searcher.getIndexReader(), nonFullTextConstraints, indexNode.getDefinition());
if (luceneRequestFacade.getLuceneRequest() instanceof Query) {
Query query = (Query) luceneRequestFacade.getLuceneRequest();
TotalHitCountCollector collector = new TotalHitCountCollector();
searcher.search(query, collector);
int totalHits = collector.getTotalHits();
LOG.debug("Estimated size for query {} is {}", query, totalHits);
return totalHits;
}
LOG.debug("Estimated size: not a Query: {}", luceneRequestFacade.getLuceneRequest());
} catch (IOException e) {
LOG.warn("query via {} failed.", LuceneIndex.this, e);
} finally {
indexNode.release();
}
return -1;
}
};
return new LucenePathCursor(itr, settings, sizeEstimator, filter);
}
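The document-loading loop in the iterator above is a searchAfter pagination pattern with a geometrically growing batch size. A minimal sketch of just that loop, not Oak code: the index path, field name, analyzer, query, and batch limits are assumptions.
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class SearchAfterSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical index path, field, and query; Oak drives this loop from its own result queue.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("lucene-index")))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      Query query = new QueryParser("text", new StandardAnalyzer()).parse("hello world");
      int batchSize = 50;
      ScoreDoc lastDoc = null;
      while (true) {
        // First call uses search(); later calls resume after the last doc of the previous batch.
        TopDocs docs = (lastDoc == null)
            ? searcher.search(query, batchSize)
            : searcher.searchAfter(lastDoc, query, batchSize);
        if (docs.scoreDocs.length == 0) {
          break;
        }
        for (ScoreDoc doc : docs.scoreDocs) {
          System.out.println(doc.doc + " " + doc.score);
        }
        lastDoc = docs.scoreDocs[docs.scoreDocs.length - 1];
        // Grow the batch geometrically, as LuceneIndex does, to amortize repeated scans.
        batchSize = (int) Math.min(batchSize * 2L, 100_000);
      }
    }
  }
}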