use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project textdb by TextDB.
the class FuzzyTokenMatcherSourceOperator method createLuceneQueryObject.
public static Query createLuceneQueryObject(FuzzyTokenPredicate predicate) throws DataFlowException {
    try {
        /*
         * By default a BooleanQuery allows at most 1024 clauses. Since our
         * input query has no limitation on the number of tokens, we have to
         * put a check and raise the limit when needed.
         */
        if (predicate.getThreshold() > 1024)
            BooleanQuery.setMaxClauseCount(predicate.getThreshold() + 1);
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.setMinimumNumberShouldMatch(predicate.getThreshold());
        MultiFieldQueryParser qp = new MultiFieldQueryParser(
                predicate.getAttributeNames().stream().toArray(String[]::new),
                LuceneAnalyzerConstants.getLuceneAnalyzer(predicate.getLuceneAnalyzerStr()));
        for (String s : predicate.getQueryTokens()) {
            builder.add(qp.parse(s), Occur.SHOULD);
        }
        return builder.build();
    } catch (ParseException e) {
        throw new DataFlowException(e);
    }
}
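The pattern above adds one SHOULD clause per query token and then uses setMinimumNumberShouldMatch, so a document matches only when at least the threshold number of tokens occur in one of the listed attributes. Below is a minimal, self-contained sketch of the same idea against plain Lucene; the wrapper class, field names, and the use of StandardAnalyzer are illustrative assumptions, not part of TextDB.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

import java.util.List;

public class FuzzyTokenQuerySketch {
    // Builds a query that matches documents containing at least `threshold`
    // of the given tokens in any of the given fields.
    public static Query build(List<String> tokens, String[] fields, int threshold) throws ParseException {
        // Raise the clause limit only when the token count could exceed the default (1024).
        if (tokens.size() > BooleanQuery.getMaxClauseCount()) {
            BooleanQuery.setMaxClauseCount(tokens.size() + 1);
        }
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.setMinimumNumberShouldMatch(threshold);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, new StandardAnalyzer());
        for (String token : tokens) {
            builder.add(parser.parse(token), Occur.SHOULD);  // each token is an optional clause
        }
        return builder.build();
    }
}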
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project Anserini by castorini.
the class LookupTopic method search.
/**
* Prints query results to the standard output stream.
*
* @param queryName the entity name to search
* @throws Exception on error
*/
public void search(String queryName) throws Exception {
    LOG.info("Querying started...");
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    int numHits = 20;
    // find exact title
    QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
    Query titleQuery = titleParser.parse(queryName);
    TopDocs rs = searcher.search(titleQuery, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n",
                (i + 1), docs.scores[i],
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact WIKI_TITLE found! Ending search.");
        return;
    } else {
        System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
    }
    System.out.println();
    // find exact label
    QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
    Query labelQuery = labelParser.parse(queryName);
    rs = searcher.search(labelQuery, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n",
                (i + 1), docs.scores[i],
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact W3_LABEL found! Ending search.");
        return;
    } else {
        System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
    }
    System.out.println();
    float k1 = 1.5f;
    float b = 0.75f;
    Similarity similarity = new BM25Similarity(k1, b);
    searcher.setSimilarity(similarity);
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(
            new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT },
            analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    rs = searcher.search(query, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n",
                (i + 1), docs.scores[i],
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
                docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    LOG.info("Querying completed.");
}
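The method falls back from an exact title match, to an exact label match, to a BM25-ranked search over title, label, and text. A compact sketch of just that final stage is shown below, assuming a plain Lucene index on disk; the index path "index-dir", the field names, and the sample query string are placeholders, while the k1/b values mirror the ones used above.

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;

public class Bm25MultiFieldSearchSketch {
    public static void main(String[] args) throws Exception {
        // "index-dir" and the field names are placeholders for this sketch.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index-dir")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity(1.5f, 0.75f));

            MultiFieldQueryParser parser = new MultiFieldQueryParser(
                    new String[] { "title", "label", "text" }, new SimpleAnalyzer());
            parser.setDefaultOperator(QueryParser.Operator.OR);  // a match in any field is enough

            Query query = parser.parse("barack obama");
            TopDocs hits = searcher.search(query, 20);
            for (ScoreDoc sd : hits.scoreDocs) {
                System.out.printf("doc=%d score=%f%n", sd.doc, sd.score);
            }
        }
    }
}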
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project Anserini by castorini.
the class LookupTopic method search.
/**
 * Prints the top topic matches for a query, searching the name, label, and alias fields.
 *
 * @param queryName query topic name
 * @param numHits maximum number of hits to print
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // search for query in multiple fields
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(
            new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS },
            new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n",
                (i + 1), docs.scores[i],
                docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue(),
                docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue(),
                docs.documents[i].getField(IndexTopics.FIELD_ALIAS).stringValue(),
                docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
}
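This overload relies on Anserini's ScoredDocuments helper to expose the matched documents and scores as parallel arrays. Without that helper, stored fields can be read directly from the TopDocs result; the fragment below is a hedged sketch that reuses the searcher and hits variables from the BM25 sketch above, and the "topic_mid" field name is a placeholder.

// assuming `searcher` is an IndexSearcher and `hits` a TopDocs result, as in the sketch above
// (needs org.apache.lucene.document.Document on the import list)
for (ScoreDoc sd : hits.scoreDocs) {
    Document doc = searcher.doc(sd.doc);   // load the stored document for this hit
    String mid = doc.get("topic_mid");     // returns null when the field was not stored
    System.out.printf("%s (score %.3f)%n", mid, sd.score);
}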
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project carbondata by apache.
the class LuceneFineGrainDataMap method prune.
/**
* Prune the datamap with filter expression. It returns the list of
* blocklets where these filters can exist.
*/
@Override
public List<FineGrainBlocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, List<PartitionSpec> partitions) throws IOException {
    // convert filter expr into lucene list query
    List<String> fields = new ArrayList<String>();
    // only for test, query all data
    String strQuery = getQueryString(filterExp.getFilterExpression());
    String[] sFields = new String[fields.size()];
    fields.toArray(sFields);
    // get analyzer
    if (analyzer == null) {
        analyzer = new StandardAnalyzer();
    }
    // use MultiFieldQueryParser to parse the query
    QueryParser queryParser = new MultiFieldQueryParser(sFields, analyzer);
    Query query;
    try {
        query = queryParser.parse(strQuery);
    } catch (ParseException e) {
        String errorMessage = String.format("failed to filter block with query %s, detail is %s", strQuery, e.getMessage());
        LOGGER.error(errorMessage);
        return null;
    }
    // execute index search
    TopDocs result;
    try {
        result = indexSearcher.search(query, MAX_RESULT_NUMBER);
    } catch (IOException e) {
        String errorMessage = String.format("failed to search lucene data, detail is %s", e.getMessage());
        LOGGER.error(errorMessage);
        throw new IOException(errorMessage);
    }
    // temporary data, delete duplicated data
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    Map<String, Map<String, Map<Integer, Set<Integer>>>> mapBlocks = new HashMap<>();
    for (ScoreDoc scoreDoc : result.scoreDocs) {
        // get a document
        Document doc = indexSearcher.doc(scoreDoc.doc);
        // get all fields
        List<IndexableField> fieldsInDoc = doc.getFields();
        // get this block id Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
        String blockId = fieldsInDoc.get(BLOCKID_ID).stringValue();
        Map<String, Map<Integer, Set<Integer>>> mapBlocklets = mapBlocks.get(blockId);
        if (mapBlocklets == null) {
            mapBlocklets = new HashMap<>();
            mapBlocks.put(blockId, mapBlocklets);
        }
        // get the blocklet id Map<BlockletId, Map<PageId, Set<RowId>>>
        String blockletId = fieldsInDoc.get(BLOCKLETID_ID).stringValue();
        Map<Integer, Set<Integer>> mapPageIds = mapBlocklets.get(blockletId);
        if (mapPageIds == null) {
            mapPageIds = new HashMap<>();
            mapBlocklets.put(blockletId, mapPageIds);
        }
        // get the page id Map<PageId, Set<RowId>>
        Number pageId = fieldsInDoc.get(PAGEID_ID).numericValue();
        Set<Integer> setRowId = mapPageIds.get(pageId.intValue());
        if (setRowId == null) {
            setRowId = new HashSet<>();
            mapPageIds.put(pageId.intValue(), setRowId);
        }
        // get the row id Set<RowId>
        Number rowId = fieldsInDoc.get(ROWID_ID).numericValue();
        setRowId.add(rowId.intValue());
    }
    // result blocklets
    List<FineGrainBlocklet> blocklets = new ArrayList<>();
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    for (Map.Entry<String, Map<String, Map<Integer, Set<Integer>>>> mapBlock : mapBlocks.entrySet()) {
        String blockId = mapBlock.getKey();
        Map<String, Map<Integer, Set<Integer>>> mapBlocklets = mapBlock.getValue();
        // for blocklets in this block Map<BlockletId, Map<PageId, Set<RowId>>>
        for (Map.Entry<String, Map<Integer, Set<Integer>>> mapBlocklet : mapBlocklets.entrySet()) {
            String blockletId = mapBlocklet.getKey();
            Map<Integer, Set<Integer>> mapPageIds = mapBlocklet.getValue();
            List<FineGrainBlocklet.Page> pages = new ArrayList<FineGrainBlocklet.Page>();
            // for pages in this blocklet Map<PageId, Set<RowId>>
            for (Map.Entry<Integer, Set<Integer>> mapPageId : mapPageIds.entrySet()) {
                // construct array rowid
                int[] rowIds = new int[mapPageId.getValue().size()];
                int i = 0;
                // for rowids in this page Set<RowId>
                for (Integer rowid : mapPageId.getValue()) {
                    rowIds[i++] = rowid;
                }
                // construct one page
                FineGrainBlocklet.Page page = new FineGrainBlocklet.Page();
                page.setPageId(mapPageId.getKey());
                page.setRowId(rowIds);
                // add this page into list pages
                pages.add(page);
            }
            // add a FineGrainBlocklet
            blocklets.add(new FineGrainBlocklet(blockId, blockletId, pages));
        }
    }
    return blocklets;
}
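The three get/put-if-absent blocks that build the Block -> Blocklet -> Page -> RowId map can be collapsed with Map.computeIfAbsent. The fragment below is only an equivalent reformulation of the aggregation loop above, not CarbonData's actual code; it reuses the same field-index constants, result, and indexSearcher.

Map<String, Map<String, Map<Integer, Set<Integer>>>> mapBlocks = new HashMap<>();
for (ScoreDoc scoreDoc : result.scoreDocs) {
    List<IndexableField> fieldsInDoc = indexSearcher.doc(scoreDoc.doc).getFields();
    String blockId = fieldsInDoc.get(BLOCKID_ID).stringValue();
    String blockletId = fieldsInDoc.get(BLOCKLETID_ID).stringValue();
    int pageId = fieldsInDoc.get(PAGEID_ID).numericValue().intValue();
    int rowId = fieldsInDoc.get(ROWID_ID).numericValue().intValue();
    // one computeIfAbsent per nesting level replaces the explicit null checks
    mapBlocks.computeIfAbsent(blockId, k -> new HashMap<>())
            .computeIfAbsent(blockletId, k -> new HashMap<>())
            .computeIfAbsent(pageId, k -> new HashSet<>())
            .add(rowId);
}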
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project epadd by ePADD.
the class Highlighter method highlight.
private static String highlight(String content, String term, String preTag, String postTag) throws IOException, ParseException, InvalidTokenOffsetsException {
    // The Lucene Highlighter is used in a hacky way here; it is intended to retrieve fragments from a matching Lucene document.
    // The Lucene Highlighter introduces tags around every token that matched the query, so these fragmented annotations have to be merged into one in order to fit our needs.
    // To truly differentiate contiguous fragments that match a supplied term, we add a unique id to the pre-tag, hence the random instance.
    // TODO: Explain what is happening here
    // Version lv = Indexer.LUCENE_VERSION;
    // To sidestep reset/close issues, two analyzers are initialized.
    // TODO: get rid of two analyzers.
    Analyzer snAnalyzer, snAnalyzer2;
    snAnalyzer = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    snAnalyzer2 = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    Fragmenter fragmenter = new NullFragmenter();
    QueryParser qp = new MultiFieldQueryParser(new String[] { "" }, snAnalyzer2);
    BooleanQuery.Builder querybuilder = new BooleanQuery.Builder();
    TokenStream stream = snAnalyzer.tokenStream(null, new StringReader(content));
    int r = randnum.nextInt();
    String upreTag = preTag.replaceAll(">$", " data-ignore=" + r + " >");
    Formatter formatter = new SimpleHTMLFormatter(upreTag, postTag);
    // A parse exception may occur while parsing terms like "AND", "OR", etc.
    try {
        querybuilder.add(new BooleanClause(qp.parse(term), BooleanClause.Occur.SHOULD));
    } catch (ParseException pe) {
        if (log.isDebugEnabled())
            log.debug("Exception while parsing: " + term, pe);
        return content;
    }
    Scorer scorer = new QueryScorer(querybuilder.build());
    org.apache.lucene.search.highlight.Highlighter highlighter = new org.apache.lucene.search.highlight.Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);
    highlighter.setMaxDocCharsToAnalyze(Math.max(org.apache.lucene.search.highlight.Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE, content.length()));
    String result = highlighter.getBestFragment(stream, content);
    snAnalyzer.close();
    snAnalyzer2.close();
    if (result != null) {
        result = mergeContiguousFragments(result, term, upreTag, postTag);
        // and then remove the extra info we appended to the tags
        result = result.replaceAll(" data-ignore=" + r + " >", ">");
        return result;
    } else
        return content;
}
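Stripped of the ePADD-specific tag bookkeeping and fragment merging, the core Highlighter wiring looks like the sketch below: a NullFragmenter keeps the whole text as a single fragment while matched tokens get wrapped in the formatter's tags. The field name "body", the CSS class, and the use of StandardAnalyzer are assumptions for illustration only.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {
    public static String highlight(String content, String term) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Query query = new QueryParser("body", analyzer).parse(term);   // "body" is a placeholder field
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<span class=\"hl\">", "</span>"),
                new QueryScorer(query));
        highlighter.setTextFragmenter(new NullFragmenter());            // keep the whole text as one fragment
        highlighter.setMaxDocCharsToAnalyze(content.length());
        // getBestFragment analyzes `content` with the analyzer and wraps matching tokens in the tags
        String result = highlighter.getBestFragment(analyzer, "body", content);
        return result != null ? result : content;                       // null when nothing matched
    }
}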