use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project textdb by TextDB.
the class RegexMatcherSourceOperator method createLuceneQuery.
public static Query createLuceneQuery(RegexSourcePredicate predicate) throws StorageException {
    Query luceneQuery;
    String queryString;
    // Try to apply the translator. If it fails, fall back to a scan query.
    try {
        queryString = RegexToGramQueryTranslator.translate(predicate.getRegex()).getLuceneQueryString();
    } catch (com.google.re2j.PatternSyntaxException e) {
        queryString = DataflowUtils.LUCENE_SCAN_QUERY;
    }
    // Try to parse the query string. If it fails, raise an exception.
    try {
        luceneQuery = new MultiFieldQueryParser(
                predicate.getAttributeNames().stream().toArray(String[]::new),
                RelationManager.getInstance().getTableAnalyzer(predicate.getTableName()))
                .parse(queryString);
    } catch (ParseException e) {
        throw new StorageException(e);
    }
    return luceneQuery;
}
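For reference, MultiFieldQueryParser expands each term of the query string across all of the fields handed to its constructor, which is why a single regex-derived query string can cover every attribute name. A minimal sketch, assuming hypothetical field names and a StandardAnalyzer rather than the table's analyzer:
// Minimal sketch (hypothetical fields "title" and "body"):
// MultiFieldQueryParser duplicates each term across all listed fields.
MultiFieldQueryParser parser = new MultiFieldQueryParser(
        new String[] { "title", "body" }, new StandardAnalyzer());
Query q = parser.parse("lucene");
// q.toString() prints roughly: (title:lucene body:lucene)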
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project SSM by Intel-bigdata.
the class LuceneSearch method query.
/* (non-Javadoc)
 * @see org.apache.zeppelin.search.Search#query(java.lang.String)
 */
@Override
public List<Map<String, String>> query(String queryStr) {
    if (null == ramDirectory) {
        throw new IllegalStateException("Something went wrong on instance creation time, index dir is null");
    }
    List<Map<String, String>> result = Collections.emptyList();
    try (IndexReader indexReader = DirectoryReader.open(ramDirectory)) {
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE }, analyzer);
        Query query = parser.parse(queryStr);
        LOG.debug("Searching for: " + query.toString(SEARCH_FIELD_TEXT));
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        result = doSearch(indexSearcher, query, analyzer, highlighter);
        indexReader.close();
    } catch (IOException e) {
        LOG.error("Failed to open index dir {}, make sure indexing finished OK", ramDirectory, e);
    } catch (ParseException e) {
        LOG.error("Failed to parse query " + queryStr, e);
    }
    return result;
}
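doSearch is project-specific, but the Highlighter it receives follows the usual Lucene pattern: each hit's stored text is re-analyzed and the best-scoring fragment is wrapped by the SimpleHTMLFormatter (in <B> tags by default). A rough sketch of that step, assuming each hit stores its raw text in SEARCH_FIELD_TEXT:
// Rough sketch of the highlighting step inside a doSearch-style helper
// (hypothetical; assumes the raw text is stored in SEARCH_FIELD_TEXT):
private List<String> highlightHits(IndexSearcher indexSearcher, Query query,
                                   Analyzer analyzer, Highlighter highlighter) throws Exception {
    List<String> fragments = new ArrayList<>();
    TopDocs hits = indexSearcher.search(query, 20);
    for (ScoreDoc sd : hits.scoreDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        String text = doc.get(SEARCH_FIELD_TEXT);
        // getBestFragment re-analyzes the text and wraps matching terms, e.g. <B>term</B>
        fragments.add(highlighter.getBestFragment(analyzer, SEARCH_FIELD_TEXT, text));
    }
    return fragments;
}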
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project neo4j by neo4j.
the class FulltextIndexReader method parseFulltextQuery.
private Query parseFulltextQuery(String query) throws ParseException {
    MultiFieldQueryParser multiFieldQueryParser = new MultiFieldQueryParser(propertyNames, analyzer);
    multiFieldQueryParser.setAllowLeadingWildcard(true);
    return multiFieldQueryParser.parse(query);
}
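The setAllowLeadingWildcard(true) call matters here: by default the classic query parser rejects terms that start with * or ?, because a leading wildcard forces a scan over the whole term dictionary. A small illustration with hypothetical field names:
// Without setAllowLeadingWildcard(true), a leading wildcard is rejected.
MultiFieldQueryParser p = new MultiFieldQueryParser(
        new String[] { "name", "description" }, new StandardAnalyzer());
try {
    p.parse("*base");   // throws ParseException: '*' not allowed as first character
} catch (ParseException e) {
    p.setAllowLeadingWildcard(true);
    Query q = p.parse("*base");   // now parses to wildcard queries on both fields
}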
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project epadd by ePADD.
the class Indexer method setupForRead.
/**
 * Sets up the indexer for read-only access. If it is needed for writing only, call
 * setupForWrite; if both read and write are needed, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();
    // closeHandles();
    try {
        setupDirectory();
        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        // for subject-only search
        String[] defaultSearchFieldSubject = new String[] { "title" };
        String[] defaultSearchFieldCorrespondents;
        // for searching in the entity fields of the document
        String[] defaultSearchFieldEntities = new String[] { "names", "en_names_title" };
        // the body field should be there, as the content of the attachment lies in this field; should the meta field also be included?
        // why search over en_names and en_names_original when body/body_original is already included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names", "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" };
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names", "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. The problem is that we're not using the Lucene EnglishPossessiveFilter, so
        // openNLPNER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in, say, monthly cards, we
        // will not match the message with this sentence because of the apostrophe.
        // for searching an attachment by fileName
        String[] metaSearchFields = new String[] { "fileName" };
        // Parse a simple query that searches for "text":
        if (parser == null) {
            // parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(defaultSearchFields, analyzer);
            // parserEntityFields = new MultiFieldQueryParser(defaultSearchFieldEntities);
            parserOriginal = new MultiFieldQueryParser(defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(defaultSearchFieldCorrespondents, analyzer);
            parserMeta = new MultiFieldQueryParser(metaSearchFields, new KeywordAnalyzer());
        }
        /**
         * A bunch of gotchas here.
         * It's a bad idea to store Lucene internal docIds, as no assumptions should be made about them,
         * not even that they are sequential. When searching, Lucene may ignore logically deleted docs.
         * Lucene does not drop deleted docs immediately, and having them in the index can degrade search
         * performance by up to 50%. Deleted docs are cleaned up only when index segments are merged.
         */
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;
        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which " + ireader.numDeletedDocs() + " are deleted\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);
            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            /*
            for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }
            */
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }
        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            // read-only=true
            isearcher_blob = new IndexSearcher(ireader_blob);
            blobDocIds = new LinkedHashMap<>();
            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            /*
            for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }
            */
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }
        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: " + numAttachmentDeletedDocs);
        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}
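The gotchas block above notes that deleted docs disappear only when segments are merged. As a side note (not ePADD code), one way to force that cleanup explicitly is IndexWriter.forceMergeDeletes, sketched below under the assumption that the same directory and analyzer are available:
// Hypothetical maintenance step (not part of ePADD's Indexer): rewrite segments
// that contain deletions so numDeletedDocs() drops back to zero.
static void purgeDeletedDocs(Directory directory, Analyzer analyzer) throws IOException {
    try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
        writer.forceMergeDeletes(); // expensive: merges every segment holding deleted docs
        writer.commit();
    }
}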
use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project carbondata by apache.
the class LuceneFineGrainIndex method prune.
/**
 * Prune the index with a filter expression. It returns the list of
 * blocklets where these filters can exist.
 */
@Override
public List<FineGrainBlocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, FilterExecutor filterExecutor, CarbonTable carbonTable) throws IOException {
    // convert the filter expression into a lucene query string
    List<String> fields = new ArrayList<String>();
    // only for test, query all data
    String strQuery = getQueryString(filterExp.getFilterExpression());
    int maxDocs;
    try {
        maxDocs = getMaxDoc(filterExp.getFilterExpression());
    } catch (NumberFormatException e) {
        maxDocs = Integer.MAX_VALUE;
    }
    if (null == strQuery) {
        return null;
    }
    String[] sFields = new String[fields.size()];
    fields.toArray(sFields);
    // get the analyzer
    if (analyzer == null) {
        analyzer = new StandardAnalyzer();
    }
    // use MultiFieldQueryParser to parse the query
    QueryParser queryParser = new MultiFieldQueryParser(sFields, analyzer);
    queryParser.setAllowLeadingWildcard(true);
    Query query;
    try {
        query = queryParser.parse(strQuery);
    } catch (ParseException e) {
        String errorMessage = String.format("failed to filter block with query %s, detail is %s", strQuery, e.getMessage());
        LOGGER.error(errorMessage, e);
        return null;
    }
    // temporary data; duplicated entries are removed
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    Map<String, Map<Integer, List<Short>>> mapBlocks = new HashMap<>();
    long luceneSearchStartTime = System.currentTimeMillis();
    for (Map.Entry<String, IndexSearcher> searcherEntry : indexSearcherMap.entrySet()) {
        IndexSearcher indexSearcher = searcherEntry.getValue();
        // take the min of the total documents available in the reader and the limit set by the user
        maxDocs = Math.min(maxDocs, indexSearcher.getIndexReader().maxDoc());
        // execute index search
        TopDocs result = null;
        // the number of documents to be queried in one search; it will always be the minimum of
        // the search limit and maxDocs
        int numberOfDocumentsToBeQueried = 0;
        // counter for maintaining the total number of documents queried so far
        int documentHitCounter = 0;
        try {
            numberOfDocumentsToBeQueried = Math.min(maxDocs, SEARCH_LIMIT);
            result = indexSearcher.search(query, numberOfDocumentsToBeQueried);
            documentHitCounter += numberOfDocumentsToBeQueried;
        } catch (IOException e) {
            String errorMessage = String.format("failed to search lucene data, detail is %s", e.getMessage());
            LOGGER.error(errorMessage, e);
            throw new IOException(errorMessage, e);
        }
        ByteBuffer intBuffer = ByteBuffer.allocate(4);
        // last scoreDoc in a result, to be used with the searchAfter API
        ScoreDoc lastScoreDoc = null;
        while (true) {
            for (ScoreDoc scoreDoc : result.scoreDocs) {
                // get a document
                Document doc = indexSearcher.doc(scoreDoc.doc);
                // get all fields
                List<IndexableField> fieldsInDoc = doc.getFields();
                if (writeCacheSize > 0) {
                    // fill row ids into the map; each value combines multiple rows
                    fillMapForCombineRows(intBuffer, mapBlocks, fieldsInDoc, searcherEntry.getKey());
                } else {
                    // fill row ids into the map
                    fillMap(intBuffer, mapBlocks, fieldsInDoc, searcherEntry.getKey());
                }
                lastScoreDoc = scoreDoc;
            }
            // result holds the total number of hits, therefore we always need to query the
            // leftover documents
            int remainingHits = result.totalHits - documentHitCounter;
            // break the loop if the count reaches maxDocs or the remaining hits become <= 0
            if (remainingHits <= 0 || documentHitCounter >= maxDocs) {
                break;
            }
            numberOfDocumentsToBeQueried = Math.min(remainingHits, SEARCH_LIMIT);
            result = indexSearcher.searchAfter(lastScoreDoc, query, numberOfDocumentsToBeQueried);
            documentHitCounter += numberOfDocumentsToBeQueried;
        }
    }
    LOGGER.info("Time taken for lucene search: " + (System.currentTimeMillis() - luceneSearchStartTime) + " ms");
    // result blocklets
    List<FineGrainBlocklet> blocklets = new ArrayList<>();
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    for (Map.Entry<String, Map<Integer, List<Short>>> mapBlocklet : mapBlocks.entrySet()) {
        String blockletId = mapBlocklet.getKey();
        Map<Integer, List<Short>> mapPageIds = mapBlocklet.getValue();
        List<FineGrainBlocklet.Page> pages = new ArrayList<FineGrainBlocklet.Page>();
        // for pages in this blocklet: Map<PageId, Set<RowId>>
        for (Map.Entry<Integer, List<Short>> mapPageId : mapPageIds.entrySet()) {
            // construct the rowid array
            int[] rowIds = new int[mapPageId.getValue().size()];
            int i = 0;
            // for rowids in this page: Set<RowId>
            for (Short rowid : mapPageId.getValue()) {
                rowIds[i++] = rowid;
            }
            // construct one page
            FineGrainBlocklet.Page page = new FineGrainBlocklet.Page();
            page.setPageId(mapPageId.getKey());
            page.setRowId(rowIds);
            // add this page to the list of pages
            pages.add(page);
        }
        // add a FineGrainBlocklet
        blocklets.add(new FineGrainBlocklet(filePath, blockletId, pages));
    }
    return blocklets;
}
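The paging above is built on IndexSearcher.searchAfter, which resumes a search from the last ScoreDoc of the previous page so earlier hits are not fetched again. A stripped-down sketch of just that loop, with a hypothetical page size:
// Stripped-down searchAfter pagination loop (PAGE_SIZE is a hypothetical constant):
int PAGE_SIZE = 1000;
ScoreDoc last = null;
while (true) {
    TopDocs page = (last == null)
            ? indexSearcher.search(query, PAGE_SIZE)
            : indexSearcher.searchAfter(last, query, PAGE_SIZE);
    for (ScoreDoc sd : page.scoreDocs) {
        // process indexSearcher.doc(sd.doc) here
        last = sd;
    }
    // fewer hits than the page size means everything has been seen
    if (page.scoreDocs.length < PAGE_SIZE) {
        break;
    }
}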