Example 1 with MultiFieldQueryParser

Use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project textdb by TextDB.

In the class FuzzyTokenMatcherSourceOperator, the method createLuceneQueryObject:

public static Query createLuceneQueryObject(FuzzyTokenPredicate predicate) throws DataFlowException {
    try {
        /*
         * By default, a BooleanQuery accepts at most 1024 clauses. Since our
         * input query has no limitation on the number of tokens, we have to
         * raise the limit ourselves when the threshold exceeds it.
         */
        if (predicate.getThreshold() > 1024)
            BooleanQuery.setMaxClauseCount(predicate.getThreshold() + 1);
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.setMinimumNumberShouldMatch(predicate.getThreshold());
        MultiFieldQueryParser qp = new MultiFieldQueryParser(predicate.getAttributeNames().stream().toArray(String[]::new), LuceneAnalyzerConstants.getLuceneAnalyzer(predicate.getLuceneAnalyzerStr()));
        for (String s : predicate.getQueryTokens()) {
            builder.add(qp.parse(s), Occur.SHOULD);
        }
        return builder.build();
    } catch (ParseException e) {
        throw new DataFlowException(e);
    }
}
Also used: BooleanQuery(org.apache.lucene.search.BooleanQuery) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) ParseException(org.apache.lucene.queryparser.classic.ParseException)
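
As a side note, the minimum-should-match pattern above can be reproduced with stock Lucene classes alone. The sketch below is not textdb code; the field and token names are illustrative, and it skips the query parser in favor of plain TermQuery clauses.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class MinShouldMatchSketch {

    // Builds a query that matches documents containing at least k of the
    // given tokens in the given field, mirroring the builder usage above.
    public static Query atLeastKTokens(String field, String[] tokens, int k) {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        // with only SHOULD clauses, this turns "match any" into "match at least k"
        builder.setMinimumNumberShouldMatch(k);
        for (String token : tokens) {
            builder.add(new TermQuery(new Term(field, token)), Occur.SHOULD);
        }
        return builder.build();
    }
}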

Example 2 with MultiFieldQueryParser

Use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project Anserini by castorini.

In the class LookupTopic, the method search:

/**
   * Prints query results to the standard output stream.
   *
   * @param queryName the entity name to search
   * @throws Exception on error
   */
public void search(String queryName) throws Exception {
    LOG.info("Querying started...");
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    int numHits = 20;
    // find exact title
    QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
    Query titleQuery = titleParser.parse(queryName);
    TopDocs rs = searcher.search(titleQuery, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact WIKI_TITLE found! Ending search.");
        return;
    } else {
        System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
    }
    System.out.println();
    // find exact label
    QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
    Query labelQuery = labelParser.parse(queryName);
    rs = searcher.search(labelQuery, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    if (docs.documents.length != 0) {
        System.out.println("Exact W3_LABEL found! Ending search.");
        return;
    } else {
        System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
    }
    System.out.println();
    float k1 = 1.5f;
    float b = 0.75f;
    Similarity similarity = new BM25Similarity(k1, b);
    searcher.setSimilarity(similarity);
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    rs = searcher.search(query, numHits);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
    LOG.info("Querying completed.");
}
Also used: IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ScoredDocuments(io.anserini.rerank.ScoredDocuments)
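
The final BM25 stage can also be exercised in isolation. The following is a minimal sketch, not Anserini code: the index path comes from the command line, and the field names ("title", "label", "text") and hit count are assumptions for illustration.

import java.nio.file.Paths;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.FSDirectory;

public class Bm25LookupSketch {

    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // same k1/b values as the snippet above
            searcher.setSimilarity(new BM25Similarity(1.5f, 0.75f));
            MultiFieldQueryParser parser = new MultiFieldQueryParser(
                    new String[] { "title", "label", "text" }, new SimpleAnalyzer());
            parser.setDefaultOperator(QueryParser.Operator.OR);
            Query query = parser.parse(args[1]);
            for (ScoreDoc hit : searcher.search(query, 20).scoreDocs) {
                System.out.printf("%f %s%n", hit.score, searcher.doc(hit.doc).get("title"));
            }
        }
    }
}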

Example 3 with MultiFieldQueryParser

Use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project Anserini by castorini.

In the class LookupTopic, the method search:

/**
 * Searches the topic index for the given name across multiple fields and prints the top hits.
 *
 * @param queryName query topic name
 * @param numHits maximum number of hits to print
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // search for query in multiple fields
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS }, new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format("%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", (i + 1), docs.scores[i], docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_ALIAS).stringValue(), docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
}
Also used: IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) Query(org.apache.lucene.search.Query) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ScoredDocuments(io.anserini.rerank.ScoredDocuments)
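
A variation worth knowing about: the classic MultiFieldQueryParser also offers a constructor taking per-field boosts, which a lookup like this one could use to weight name matches above alias matches. A short sketch with illustrative field names (not Anserini's):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.Query;

public class BoostedLookupSketch {

    public static Query parseBoosted(String queryName) throws Exception {
        // weight exact-name matches twice as heavily as alias matches
        Map<String, Float> boosts = new HashMap<>();
        boosts.put("name", 2.0f);
        boosts.put("alias", 1.0f);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(
                new String[] { "name", "alias" }, new SimpleAnalyzer(), boosts);
        return parser.parse(queryName);
    }
}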

Example 4 with MultiFieldQueryParser

Use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project carbondata by apache.

In the class LuceneFineGrainDataMap, the method prune:

/**
 * Prunes the datamap with the filter expression and returns the list of
 * blocklets where matching data may exist.
 */
@Override
public List<FineGrainBlocklet> prune(FilterResolverIntf filterExp, SegmentProperties segmentProperties, List<PartitionSpec> partitions) throws IOException {
    // convert the filter expression into a Lucene query string
    List<String> fields = new ArrayList<String>();
    // only for test: query all data (the field list is left empty, so the
    // query string itself is expected to name the fields to search)
    String strQuery = getQueryString(filterExp.getFilterExpression());
    String[] sFields = new String[fields.size()];
    fields.toArray(sFields);
    // get analyzer
    if (analyzer == null) {
        analyzer = new StandardAnalyzer();
    }
    // use MultiFieldQueryParser to parse the query
    QueryParser queryParser = new MultiFieldQueryParser(sFields, analyzer);
    Query query;
    try {
        query = queryParser.parse(strQuery);
    } catch (ParseException e) {
        String errorMessage = String.format("failed to filter block with query %s, detail is %s", strQuery, e.getMessage());
        LOGGER.error(errorMessage);
        return null;
    }
    // execute index search
    TopDocs result;
    try {
        result = indexSearcher.search(query, MAX_RESULT_NUMBER);
    } catch (IOException e) {
        String errorMessage = String.format("failed to search lucene data, detail is %s", e.getMessage());
        LOGGER.error(errorMessage);
        throw new IOException(errorMessage);
    }
    // temporary structure to de-duplicate hits:
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    Map<String, Map<String, Map<Integer, Set<Integer>>>> mapBlocks = new HashMap<>();
    for (ScoreDoc scoreDoc : result.scoreDocs) {
        // get a document
        Document doc = indexSearcher.doc(scoreDoc.doc);
        // get all fields
        List<IndexableField> fieldsInDoc = doc.getFields();
        // get this block id Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
        String blockId = fieldsInDoc.get(BLOCKID_ID).stringValue();
        Map<String, Map<Integer, Set<Integer>>> mapBlocklets = mapBlocks.get(blockId);
        if (mapBlocklets == null) {
            mapBlocklets = new HashMap<>();
            mapBlocks.put(blockId, mapBlocklets);
        }
        // get the blocklet id Map<BlockletId, Map<PageId, Set<RowId>>>
        String blockletId = fieldsInDoc.get(BLOCKLETID_ID).stringValue();
        Map<Integer, Set<Integer>> mapPageIds = mapBlocklets.get(blockletId);
        if (mapPageIds == null) {
            mapPageIds = new HashMap<>();
            mapBlocklets.put(blockletId, mapPageIds);
        }
        // get the page id Map<PageId, Set<RowId>>
        Number pageId = fieldsInDoc.get(PAGEID_ID).numericValue();
        Set<Integer> setRowId = mapPageIds.get(pageId.intValue());
        if (setRowId == null) {
            setRowId = new HashSet<>();
            mapPageIds.put(pageId.intValue(), setRowId);
        }
        // get the row id Set<RowId>
        Number rowId = fieldsInDoc.get(ROWID_ID).numericValue();
        setRowId.add(rowId.intValue());
    }
    // result blocklets
    List<FineGrainBlocklet> blocklets = new ArrayList<>();
    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    for (Map.Entry<String, Map<String, Map<Integer, Set<Integer>>>> mapBlock : mapBlocks.entrySet()) {
        String blockId = mapBlock.getKey();
        Map<String, Map<Integer, Set<Integer>>> mapBlocklets = mapBlock.getValue();
        // for blocklets in this block Map<BlockletId, Map<PageId, Set<RowId>>>
        for (Map.Entry<String, Map<Integer, Set<Integer>>> mapBlocklet : mapBlocklets.entrySet()) {
            String blockletId = mapBlocklet.getKey();
            Map<Integer, Set<Integer>> mapPageIds = mapBlocklet.getValue();
            List<FineGrainBlocklet.Page> pages = new ArrayList<FineGrainBlocklet.Page>();
            // for pages in this blocklet Map<PageId, Set<RowId>>>
            for (Map.Entry<Integer, Set<Integer>> mapPageId : mapPageIds.entrySet()) {
                // construct the row id array
                int[] rowIds = new int[mapPageId.getValue().size()];
                int i = 0;
                // for rowids in this page Set<RowId>
                for (Integer rowid : mapPageId.getValue()) {
                    rowIds[i++] = rowid;
                }
                // construct one page
                FineGrainBlocklet.Page page = new FineGrainBlocklet.Page();
                page.setPageId(mapPageId.getKey());
                page.setRowId(rowIds);
                // add this page into list pages
                pages.add(page);
            }
            // add a FineGrainBlocklet
            blocklets.add(new FineGrainBlocklet(blockId, blockletId, pages));
        }
    }
    return blocklets;
}
Also used: Document(org.apache.lucene.document.Document) FineGrainBlocklet(org.apache.carbondata.core.datamap.dev.fgdatamap.FineGrainBlocklet) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) IOException(java.io.IOException) IndexableField(org.apache.lucene.index.IndexableField) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ParseException(org.apache.lucene.queryparser.classic.ParseException) FineGrainDataMap(org.apache.carbondata.core.datamap.dev.fgdatamap.FineGrainDataMap)
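
The nested null-checked maps in the loop above can be written more compactly with Map.computeIfAbsent. A minimal sketch of the same de-duplication structure (not CarbonData code):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class HitDedupSketch {

    // Map<BlockId, Map<BlockletId, Map<PageId, Set<RowId>>>>
    private final Map<String, Map<String, Map<Integer, Set<Integer>>>> blocks = new HashMap<>();

    // records one (block, blocklet, page, row) hit, creating map levels on demand
    public void record(String blockId, String blockletId, int pageId, int rowId) {
        blocks.computeIfAbsent(blockId, k -> new HashMap<>())
              .computeIfAbsent(blockletId, k -> new HashMap<>())
              .computeIfAbsent(pageId, k -> new HashSet<>())
              .add(rowId);
    }
}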

Example 5 with MultiFieldQueryParser

Use of org.apache.lucene.queryparser.classic.MultiFieldQueryParser in project epadd by ePADD.

In the class Highlighter, the method highlight:

private static String highlight(String content, String term, String preTag, String postTag) throws IOException, ParseException, InvalidTokenOffsetsException {
    // The Lucene Highlighter is used in a hacky way here; it is meant to retrieve fragments from a matching Lucene document.
    // It wraps tags around every token that matched the query, so these fragmented annotations have to be merged into one in order to fit our needs.
    // To reliably tell apart contiguous fragments that match the supplied term, a unique id is added to the pre-tag, hence the random instance below.
    // Two analyzers are initialized to sidestep TokenStream reset/close issues.
    // TODO: get rid of the two analyzers.
    Analyzer snAnalyzer, snAnalyzer2;
    snAnalyzer = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    snAnalyzer2 = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    Fragmenter fragmenter = new NullFragmenter();
    QueryParser qp = new MultiFieldQueryParser(new String[] { "" }, snAnalyzer2);
    BooleanQuery.Builder querybuilder = new BooleanQuery.Builder();
    TokenStream stream = snAnalyzer.tokenStream(null, new StringReader(content));
    int r = randnum.nextInt();
    String upreTag = preTag.replaceAll(">$", " data-ignore=" + r + " >");
    Formatter formatter = new SimpleHTMLFormatter(upreTag, postTag);
    // Parse exception may occur while parsing terms like "AND", "OR" etc.
    try {
        querybuilder.add(new BooleanClause(qp.parse(term), BooleanClause.Occur.SHOULD));
    } catch (ParseException pe) {
        if (log.isDebugEnabled())
            log.debug("Exception while parsing: " + term, pe);
        return content;
    }
    Scorer scorer = new QueryScorer(querybuilder.build());
    org.apache.lucene.search.highlight.Highlighter highlighter = new org.apache.lucene.search.highlight.Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);
    highlighter.setMaxDocCharsToAnalyze(Math.max(org.apache.lucene.search.highlight.Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE, content.length()));
    String result = highlighter.getBestFragment(stream, content);
    snAnalyzer.close();
    snAnalyzer2.close();
    if (result != null) {
        result = mergeContiguousFragments(result, term, upreTag, postTag);
        // and then remove the extra info. we appended to the tags
        result = result.replaceAll(" data-ignore=" + r + " >", ">");
        return result;
    } else
        return content;
}
Also used: BooleanQuery(org.apache.lucene.search.BooleanQuery) TokenStream(org.apache.lucene.analysis.TokenStream) Formatter(org.apache.lucene.search.highlight.Formatter) Analyzer(org.apache.lucene.analysis.Analyzer) org.apache.lucene.search.highlight(org.apache.lucene.search.highlight) StringReader(java.io.StringReader) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) BooleanClause(org.apache.lucene.search.BooleanClause) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) ParseException(org.apache.lucene.queryparser.classic.ParseException)
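
Stripped of the fragment-merging workaround, the core Highlighter wiring reduces to the sketch below. This is not ePADD code: it swaps in a StandardAnalyzer for the project's EnglishNumberAnalyzer, and the "body" field name is an assumption.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightSketch {

    public static String highlight(String content, String term)
            throws IOException, ParseException, InvalidTokenOffsetsException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            Query query = new QueryParser("body", analyzer).parse(term);
            Highlighter highlighter = new Highlighter(
                    new SimpleHTMLFormatter("<mark>", "</mark>"), new QueryScorer(query));
            // NullFragmenter keeps the document as a single fragment
            highlighter.setTextFragmenter(new NullFragmenter());
            String result = highlighter.getBestFragment(analyzer, "body", content);
            // getBestFragment returns null when nothing matched
            return result != null ? result : content;
        }
    }
}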

Aggregations

MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser): 32
ParseException (org.apache.lucene.queryparser.classic.ParseException): 19
Query (org.apache.lucene.search.Query): 15
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 13
TermQuery (org.apache.lucene.search.TermQuery): 10
WildcardQuery (org.apache.lucene.search.WildcardQuery): 10
QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 9
IndexSearcher (org.apache.lucene.search.IndexSearcher): 9
ModuleException (it.vige.rubia.ModuleException): 8
ResultPage (it.vige.rubia.search.ResultPage): 8
Searching (it.vige.rubia.search.Searching): 8
SortBy (it.vige.rubia.search.SortBy): 8
SortOrder (it.vige.rubia.search.SortOrder): 8
IOException (java.io.IOException): 8
EntityManager (javax.persistence.EntityManager): 8
Term (org.apache.lucene.index.Term): 8
Builder (org.apache.lucene.search.BooleanQuery.Builder): 8
FullTextQuery (org.hibernate.search.FullTextQuery): 8
FullTextSession (org.hibernate.search.FullTextSession): 8
Search.getFullTextSession (org.hibernate.search.Search.getFullTextSession): 8