Search in sources :

Example 31 with MultiFieldQueryParser

use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project textdb by TextDB.

the class RegexMatcherSourceOperator method createLuceneQuery.

public static Query createLuceneQuery(RegexSourcePredicate predicate) throws StorageException {
    Query luceneQuery;
    String queryString;
    // Try to apply the translator; if it fails, fall back to the scan query.
    try {
        queryString = RegexToGramQueryTranslator.translate(predicate.getRegex()).getLuceneQueryString();
    } catch (com.google.re2j.PatternSyntaxException e) {
        queryString = DataflowUtils.LUCENE_SCAN_QUERY;
    }
    // Try to parse the query string. If it fails, raise an exception.
    try {
        luceneQuery = new MultiFieldQueryParser(
                predicate.getAttributeNames().stream().toArray(String[]::new),
                RelationManager.getInstance().getTableAnalyzer(predicate.getTableName()))
                .parse(queryString);
    } catch (ParseException e) {
        throw new StorageException(e);
    }
    return luceneQuery;
}
Also used : Query(org.apache.lucene.search.Query) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) ParseException(org.apache.lucene.queryparser.classic.ParseException) StorageException(edu.uci.ics.texera.api.exception.StorageException)
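
The pattern above (parse one query string across several fields, degrade gracefully when something fails) reduces to a minimal, self-contained sketch. The field names and the MatchAllDocsQuery fallback below are illustrative, not TextDB's actual behavior, which falls back at the translation step and raises StorageException on parse failure:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;

public class MultiFieldParseSketch {

    // Parse the same query string against every field; the per-field sub-queries are OR-ed.
    // Falls back to MatchAllDocsQuery (a full scan) instead of propagating the ParseException.
    public static Query parseOrMatchAll(String queryString, String[] fields) {
        try {
            return new MultiFieldQueryParser(fields, new StandardAnalyzer()).parse(queryString);
        } catch (ParseException e) {
            return new MatchAllDocsQuery();
        }
    }

    public static void main(String[] args) {
        Query q = parseOrMatchAll("lucene AND search", new String[] { "title", "body" });
        // Prints something like: +(title:lucene body:lucene) +(title:search body:search)
        System.out.println(q);
    }
}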

Example 32 with MultiFieldQueryParser

use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project SSM by Intel-bigdata.

the class LuceneSearch method query.

/* (non-Javadoc)
   * @see org.apache.zeppelin.search.Search#query(java.lang.String)
   */
@Override
public List<Map<String, String>> query(String queryStr) {
    if (null == ramDirectory) {
        throw new IllegalStateException("Something went wrong at instance creation time: index dir is null");
    }
    List<Map<String, String>> result = Collections.emptyList();
    try (IndexReader indexReader = DirectoryReader.open(ramDirectory)) {
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(new String[] { SEARCH_FIELD_TEXT, SEARCH_FIELD_TITLE }, analyzer);
        Query query = parser.parse(queryStr);
        LOG.debug("Searching for: " + query.toString(SEARCH_FIELD_TEXT));
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        result = doSearch(indexSearcher, query, analyzer, highlighter);
    } catch (IOException e) {
        LOG.error("Failed to open index dir {}, make sure indexing finished OK", ramDirectory, e);
    } catch (ParseException e) {
        LOG.error("Failed to parse query " + queryStr, e);
    }
    return result;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) Query(org.apache.lucene.search.Query) WildcardQuery(org.apache.lucene.search.WildcardQuery) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Highlighter(org.apache.lucene.search.highlight.Highlighter)
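
The highlighting half of that method can be exercised on its own. A compact sketch; the field name "contents" and the sample text are assumptions:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        Query query = new MultiFieldQueryParser(
                new String[] { "contents", "title" }, analyzer).parse("lucene");
        // SimpleHTMLFormatter wraps each hit in <B>...</B> by default.
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        String storedText = "Apache Lucene is a search library.";
        // getBestFragment re-analyzes the stored text to locate the match offsets.
        String fragment = highlighter.getBestFragment(analyzer, "contents", storedText);
        System.out.println(fragment); // Apache <B>Lucene</B> is a search library.
    }
}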

Example 33 with MultiFieldQueryParser

use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project neo4j by neo4j.

the class FulltextIndexReader method parseFulltextQuery.

private Query parseFulltextQuery(String query) throws ParseException {
    MultiFieldQueryParser multiFieldQueryParser = new MultiFieldQueryParser(propertyNames, analyzer);
    multiFieldQueryParser.setAllowLeadingWildcard(true);
    return multiFieldQueryParser.parse(query);
}
Also used : MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser)
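
By default the classic parser rejects queries that begin with * or ?, so the setAllowLeadingWildcard(true) call above is what permits suffix-style matches such as *son, at the cost of a potentially expensive term scan. A small sketch with illustrative field names:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.Query;

public class LeadingWildcardSketch {
    public static void main(String[] args) throws Exception {
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] { "name", "description" }, new StandardAnalyzer());
        // Without this flag, parse("*son") throws ParseException:
        // "'*' or '?' not allowed as first character in WildcardQuery"
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse("*son");
        System.out.println(query); // name:*son description:*son
    }
}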

Example 34 with MultiFieldQueryParser

use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project epadd by ePADD.

the class Indexer method setupForRead.

/**
 * Sets up the indexer for read-only access. If it is needed for writing
 * only, call setupForWrite; if both read and write are needed, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();
    // closeHandles();
    try {
        setupDirectory();
        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        // for subject only search
        String[] defaultSearchFieldSubject = new String[] { "title" };
        String[] defaultSearchFieldCorrespondents;
        // For searching in the entity field of the document.
        String[] defaultSearchFieldEntities = new String[] { "names", "en_names_title" };
        // the body field should be there, since the content of the attachment lies in this field; should the meta field also be included?
        // why search over en-names and en-names-original when body/body_original is already included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names", "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" };
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names", "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. problem is that we're not using the Lucene EnglishPossessiveFilter, so
        // openNLPNER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in say monthly cards, we
        // will not match the message with this sentence because of the apostrophe.
        // for searching an attachment by fileName
        String[] metaSearchFields = new String[] { "fileName" };
        // Parse a simple query that searches for "text":
        if (parser == null) {
            // parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(defaultSearchFields, analyzer);
            // parserEntityFields = new MultiFieldQueryParser(defaultSearchFieldEntities);
            parserOriginal = new MultiFieldQueryParser(defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(defaultSearchFieldCorrespondents, analyzer);
            parserMeta = new MultiFieldQueryParser(metaSearchFields, new KeywordAnalyzer());
        }
        /**
         * Bunch of gotchas here.
         * It's a bad idea to store Lucene's internal docIds, as no assumptions should be made
         * about them, not even that they are serial. When searching, Lucene may ignore logically deleted docs.
         * Lucene does not immediately remove deleted docs, and having them in the index may cut search performance by 50%.
         * Deleted docs are cleaned up only when indices are merged.
         */
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;
        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which " + ireader.numDeletedDocs() + " are deleted)\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);
            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            /*for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }*/
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }
        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            // read-only=true
            isearcher_blob = new IndexSearcher(ireader_blob);
            blobDocIds = new LinkedHashMap<>();
            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            /*for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }*/
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }
        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: " + numAttachmentDeletedDocs);
        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}
Also used : KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) ParseException(org.apache.lucene.queryparser.classic.ParseException) GeneralSecurityException(java.security.GeneralSecurityException) Bits(org.apache.lucene.util.Bits)
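
Note how parserMeta above pairs MultiFieldQueryParser with a KeywordAnalyzer: a file name should match as one exact token rather than being tokenized and lower-cased. A sketch contrasting the two analyzers; the fileName field and sample input are illustrative:

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;

public class AnalyzerContrastSketch {
    public static void main(String[] args) throws Exception {
        String[] fields = { "fileName" };
        // StandardAnalyzer lower-cases the term, so the query no longer matches
        // an index that stored the file name verbatim.
        System.out.println(new MultiFieldQueryParser(fields, new StandardAnalyzer())
                .parse("Setup.EXE"));   // fileName:setup.exe
        // KeywordAnalyzer keeps the input as a single, unmodified token.
        System.out.println(new MultiFieldQueryParser(fields, new KeywordAnalyzer())
                .parse("Setup.EXE"));   // fileName:Setup.EXE
    }
}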

Example 35 with MultiFieldQueryParser

use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project carbondata by apache.

the class LuceneFineGrainIndex method prune.

/**
 * Prunes the index with the filter expression and returns the list of
 * blocklets in which matches for these filters can exist.
 */
@Override
public List<FineGrainBlocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, FilterExecutor filterExecutor, CarbonTable carbonTable) throws IOException {
    // convert filter expr into lucene list query
    List<String> fields = new ArrayList<String>();
    // only for test: query all data
    String strQuery = getQueryString(filterExp.getFilterExpression());
    int maxDocs;
    try {
        maxDocs = getMaxDoc(filterExp.getFilterExpression());
    } catch (NumberFormatException e) {
        maxDocs = Integer.MAX_VALUE;
    }
    if (null == strQuery) {
        return null;
    }
    String[] sFields = new String[fields.size()];
    fields.toArray(sFields);
    // get analyzer
    if (analyzer == null) {
        analyzer = new StandardAnalyzer();
    }
    // use MultiFieldQueryParser to parse the query
    QueryParser queryParser = new MultiFieldQueryParser(sFields, analyzer);
    queryParser.setAllowLeadingWildcard(true);
    Query query;
    try {
        query = queryParser.parse(strQuery);
    } catch (ParseException e) {
        String errorMessage = String.format("failed to filter block with query %s, detail is %s", strQuery, e.getMessage());
        LOGGER.error(errorMessage, e);
        return null;
    }
    // temporary structure for de-duplicating row data
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    Map<String, Map<Integer, List<Short>>> mapBlocks = new HashMap<>();
    long luceneSearchStartTime = System.currentTimeMillis();
    for (Map.Entry<String, IndexSearcher> searcherEntry : indexSearcherMap.entrySet()) {
        IndexSearcher indexSearcher = searcherEntry.getValue();
        // take the min of the total documents available in the reader and the limit, if set by the user
        maxDocs = Math.min(maxDocs, indexSearcher.getIndexReader().maxDoc());
        // execute index search
        TopDocs result = null;
        // the number of documents to query in one search call; always the minimum of
        // SEARCH_LIMIT and the number of documents still to be fetched
        int numberOfDocumentsToBeQueried = 0;
        // counter for maintaining the total number of documents finished querying
        int documentHitCounter = 0;
        try {
            numberOfDocumentsToBeQueried = Math.min(maxDocs, SEARCH_LIMIT);
            result = indexSearcher.search(query, numberOfDocumentsToBeQueried);
            documentHitCounter += numberOfDocumentsToBeQueried;
        } catch (IOException e) {
            String errorMessage = String.format("failed to search lucene data, detail is %s", e.getMessage());
            LOGGER.error(errorMessage, e);
            throw new IOException(errorMessage, e);
        }
        ByteBuffer intBuffer = ByteBuffer.allocate(4);
        // last scoreDoc in a result to be used in searchAfter API
        ScoreDoc lastScoreDoc = null;
        while (true) {
            for (ScoreDoc scoreDoc : result.scoreDocs) {
                // get a document
                Document doc = indexSearcher.doc(scoreDoc.doc);
                // get all fields
                List<IndexableField> fieldsInDoc = doc.getFields();
                if (writeCacheSize > 0) {
                    // Fills row ids into the map; each map value combines multiple rows.
                    fillMapForCombineRows(intBuffer, mapBlocks, fieldsInDoc, searcherEntry.getKey());
                } else {
                    // Fill row ids into the map
                    fillMap(intBuffer, mapBlocks, fieldsInDoc, searcherEntry.getKey());
                }
                lastScoreDoc = scoreDoc;
            }
            // the result reports the total number of hits, so we keep querying the
            // leftover documents
            int remainingHits = result.totalHits - documentHitCounter;
            // break the loop if count reaches maxDocs to be searched or remaining hits become <=0
            if (remainingHits <= 0 || documentHitCounter >= maxDocs) {
                break;
            }
            numberOfDocumentsToBeQueried = Math.min(remainingHits, SEARCH_LIMIT);
            result = indexSearcher.searchAfter(lastScoreDoc, query, numberOfDocumentsToBeQueried);
            documentHitCounter += numberOfDocumentsToBeQueried;
        }
    }
    LOGGER.info("Time taken for lucene search: " + (System.currentTimeMillis() - luceneSearchStartTime) + " ms");
    // result blocklets
    List<FineGrainBlocklet> blocklets = new ArrayList<>();
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    for (Map.Entry<String, Map<Integer, List<Short>>> mapBlocklet : mapBlocks.entrySet()) {
        String blockletId = mapBlocklet.getKey();
        Map<Integer, List<Short>> mapPageIds = mapBlocklet.getValue();
        List<FineGrainBlocklet.Page> pages = new ArrayList<FineGrainBlocklet.Page>();
        // for pages in this blocklet Map<PageId, Set<RowId>>>
        for (Map.Entry<Integer, List<Short>> mapPageId : mapPageIds.entrySet()) {
            // construct array rowid
            int[] rowIds = new int[mapPageId.getValue().size()];
            int i = 0;
            // for rowids in this page Set<RowId>
            for (Short rowid : mapPageId.getValue()) {
                rowIds[i++] = rowid;
            }
            // construct one page
            FineGrainBlocklet.Page page = new FineGrainBlocklet.Page();
            page.setPageId(mapPageId.getKey());
            page.setRowId(rowIds);
            // add this page into list pages
            pages.add(page);
        }
        // add a FineGrainBlocklet
        blocklets.add(new FineGrainBlocklet(filePath, blockletId, pages));
    }
    return blocklets;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) List(java.util.List) FineGrainBlocklet(org.apache.carbondata.core.index.dev.fgindex.FineGrainBlocklet) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) IndexableField(org.apache.lucene.index.IndexableField) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ParseException(org.apache.lucene.queryparser.classic.ParseException) Map(java.util.Map)
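
The paging loop above is built on IndexSearcher.searchAfter, which resumes from the last ScoreDoc of the previous page instead of re-collecting earlier hits. A condensed, self-contained sketch of that pattern, assuming Lucene 8.x (where ByteBuffersDirectory replaces RAMDirectory); the page size and field name are illustrative:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class SearchAfterSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        // Index a few documents so paging has something to walk over.
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (int i = 0; i < 25; i++) {
                Document doc = new Document();
                doc.add(new TextField("body", "lucene doc " + i, Field.Store.YES));
                writer.addDocument(doc);
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new MultiFieldQueryParser(new String[] { "body" }, new StandardAnalyzer())
                    .parse("lucene");
            int pageSize = 10;
            ScoreDoc last = null;
            while (true) {
                // The first page uses search(); later pages resume after the last hit seen.
                TopDocs page = (last == null)
                        ? searcher.search(query, pageSize)
                        : searcher.searchAfter(last, query, pageSize);
                if (page.scoreDocs.length == 0) {
                    break;
                }
                for (ScoreDoc sd : page.scoreDocs) {
                    System.out.println(searcher.doc(sd.doc).get("body"));
                }
                last = page.scoreDocs[page.scoreDocs.length - 1];
            }
        }
    }
}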

Aggregations

MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser): 37 uses
ParseException (org.apache.lucene.queryparser.classic.ParseException): 24 uses
Query (org.apache.lucene.search.Query): 17 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 14 uses
QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 14 uses
Term (org.apache.lucene.index.Term): 11 uses
IndexSearcher (org.apache.lucene.search.IndexSearcher): 11 uses
TermQuery (org.apache.lucene.search.TermQuery): 11 uses
IOException (java.io.IOException): 10 uses
WildcardQuery (org.apache.lucene.search.WildcardQuery): 10 uses
ModuleException (it.vige.rubia.ModuleException): 8 uses
ResultPage (it.vige.rubia.search.ResultPage): 8 uses
Searching (it.vige.rubia.search.Searching): 8 uses
SortBy (it.vige.rubia.search.SortBy): 8 uses
SortOrder (it.vige.rubia.search.SortOrder): 8 uses
EntityManager (javax.persistence.EntityManager): 8 uses
Builder (org.apache.lucene.search.BooleanQuery.Builder): 8 uses
TopDocs (org.apache.lucene.search.TopDocs): 8 uses
FullTextQuery (org.hibernate.search.FullTextQuery): 8 uses
FullTextSession (org.hibernate.search.FullTextSession): 8 uses