Search in sources :

Example 6 with TextFragment

use of org.apache.lucene.search.highlight.TextFragment in project SearchServices by Alfresco.

In class AlfrescoSolrHighlighter, the method doHighlightingByHighlighter:

/**
 * Highlights and returns the highlight object for this field -- a String[] by default. Null if none.
 * <p>
 * Strategy: prefer a token stream rebuilt from term vectors (faster, no re-analysis);
 * for multi-valued fields the single term-vector stream is sliced into per-value
 * offset windows. Falls back to re-analyzing the stored value when term vectors
 * with offsets are unavailable.
 *
 * @param doc         stored document whose field values are to be highlighted
 * @param docId       Lucene internal doc id, used to fetch term vectors
 * @param schemaField schema definition of the field being highlighted
 * @param query       query driving the highlighting
 * @param reader      index reader supplying term vectors
 * @param req         current request; source of all hl.* parameters
 * @return the per-field fragments response object, or null when nothing matched
 * @throws IOException on index access failure
 */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    // Single-valued fields only ever examine the first value.
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) {
        // e.g. -1 means "no limit"
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();
    // Try term vectors, which is faster
    // note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        // Stop once either the fragment quota or the char-analysis budget is exhausted.
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            // single-valued with term vectors
            tstream = tvStream;
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
            // needs to implement reset() efficiently.
            // If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    // Clip analysis to the remaining char budget before caching.
                    tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
        // tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        // The char budget is shared across all values of the field.
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (// can happen via mergeContiguousFragments
                bestTextFragment == null)
                    continue;
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        // note: limits fragments (for multi-valued fields), not quite the number of values
                        --mvToMatch;
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    // no highlights for this field
    return null;
}
Also used : OffsetLimitTokenFilter(org.apache.lucene.search.highlight.OffsetLimitTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) TextFragment(org.apache.lucene.search.highlight.TextFragment) Fields(org.apache.lucene.index.Fields) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) SolrParams(org.apache.solr.common.params.SolrParams) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) SolrException(org.apache.solr.common.SolrException) Highlighter(org.apache.lucene.search.highlight.Highlighter) FastVectorHighlighter(org.apache.lucene.search.vectorhighlight.FastVectorHighlighter) SolrHighlighter(org.apache.solr.highlight.SolrHighlighter) DefaultSolrHighlighter(org.apache.solr.highlight.DefaultSolrHighlighter)

Example 7 with TextFragment

use of org.apache.lucene.search.highlight.TextFragment in project jena by apache.

In class TextIndexLucene, the method frags2string:

/**
 * Joins the given fragments into one string, separating consecutive fragments
 * with {@code opts.fragSep}. When {@code opts.joinHi} is set, adjacent highlight
 * markers within a fragment are collapsed via {@code opts.patternExpr}.
 */
private String frags2string(final TextFragment[] frags, final HighlightOpts opts) {
    final StringBuilder out = new StringBuilder();
    boolean first = true;
    for (final TextFragment fragment : frags) {
        final String text = fragment.toString();
        log.trace("found fragment {}", fragment);
        if (!first) {
            // Separator goes between fragments only, never before the first one.
            out.append(opts.fragSep);
        }
        first = false;
        if (opts.joinHi) {
            out.append(text.replaceAll(opts.patternExpr, "$1"));
        } else {
            out.append(text);
        }
    }
    return out.toString();
}
Also used : TextFragment(org.apache.lucene.search.highlight.TextFragment)

Example 8 with TextFragment

use of org.apache.lucene.search.highlight.TextFragment in project zeppelin by apache.

In class LuceneSearch, the method doSearch:

/**
 * Runs {@code query} against the index (top 20 hits) and converts each hit into
 * a map with keys "id", "name", "snippet", "text" and "header", highlighting
 * the text and title fields where present.
 *
 * @param searcher    searcher over the note index
 * @param query       parsed user query
 * @param analyzer    analyzer used to re-tokenize stored fields for highlighting
 * @param highlighter highlighter producing scored text fragments
 * @return matching paragraphs; empty when the search or highlighting fails (errors are logged)
 */
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
    List<Map<String, String>> matchingParagraphs = new ArrayList<>();
    ScoreDoc[] hits;
    try {
        hits = searcher.search(query, 20).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            LOGGER.debug("doc={} score={}", hits[i].doc, hits[i].score);
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String path = doc.get(ID_FIELD);
            // Guard clause: documents without an id field cannot be reported.
            if (path == null) {
                LOGGER.info("{}. No {} for this document", i + 1, ID_FIELD);
                continue;
            }
            LOGGER.debug("{}. {}", (i + 1), path);
            String title = doc.get("title");
            if (title != null) {
                LOGGER.debug("   Title: {}", title);
            }
            String text = doc.get(SEARCH_FIELD_TEXT);
            String header = doc.get(SEARCH_FIELD_TITLE);
            String fragment = "";
            if (text != null) {
                TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
                // getBestTextFragments never returns null, so no null check on the array is needed.
                TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 3);
                LOGGER.debug("    {} fragments found for query '{}'", frags.length, query);
                for (TextFragment frag : frags) {
                    // Individual entries can be null when contiguous fragments were merged.
                    if ((frag != null) && (frag.getScore() > 0)) {
                        LOGGER.debug("    Fragment: {}", frag);
                    }
                }
                fragment = frags.length > 0 ? frags[0].toString() : "";
            }
            if (header != null) {
                TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
                TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
                header = frgTitle.length > 0 ? frgTitle[0].toString() : "";
            } else {
                header = "";
            }
            // ImmutableMap.of rejects null values; default "title" and text to ""
            // for documents that lack them (previously a latent NullPointerException).
            matchingParagraphs.add(ImmutableMap.of(
                // "id" is <noteId>/paragraph/<paragraphId>
                "id", path,
                "name", title != null ? title : "",
                "snippet", fragment,
                "text", text != null ? text : "",
                "header", header));
        }
    } catch (IOException | InvalidTokenOffsetsException e) {
        LOGGER.error("Exception on searching for {}", query, e);
    }
    return matchingParagraphs;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) TextFragment(org.apache.lucene.search.highlight.TextFragment) LongPoint(org.apache.lucene.document.LongPoint) ScoreDoc(org.apache.lucene.search.ScoreDoc) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 9 with TextFragment

use of org.apache.lucene.search.highlight.TextFragment in project SSM by Intel-bigdata.

In class LuceneSearch, the method doSearch:

/**
 * Runs {@code query} against the index (top 20 hits) and converts each hit into
 * a map with keys "id", "name", "snippet", "text" and "header", highlighting
 * the text and title fields where present.
 *
 * @param searcher    searcher over the index
 * @param query       parsed user query
 * @param analyzer    analyzer used to re-tokenize stored fields for highlighting
 * @param highlighter highlighter producing scored text fragments
 * @return matching paragraphs; empty when the search or highlighting fails (errors are logged)
 */
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
    List<Map<String, String>> matchingParagraphs = Lists.newArrayList();
    ScoreDoc[] hits;
    try {
        hits = searcher.search(query, 20).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            LOG.debug("doc={} score={}", hits[i].doc, hits[i].score);
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String path = doc.get(ID_FIELD);
            // Guard clause: documents without an id field cannot be reported.
            if (path == null) {
                LOG.info("{}. No {} for this document", i + 1, ID_FIELD);
                continue;
            }
            // Parameterized logging instead of eager string concatenation.
            LOG.debug("{}. {}", i + 1, path);
            String title = doc.get("title");
            if (title != null) {
                LOG.debug("   Title: {}", title);
            }
            String text = doc.get(SEARCH_FIELD_TEXT);
            String header = doc.get(SEARCH_FIELD_TITLE);
            String fragment = "";
            if (text != null) {
                TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
                // getBestTextFragments never returns null, so no null check on the array is needed.
                TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, true, 3);
                LOG.debug("    {} fragments found for query '{}'", frag.length, query);
                for (TextFragment f : frag) {
                    // Individual entries can be null when contiguous fragments were merged.
                    if ((f != null) && (f.getScore() > 0)) {
                        LOG.debug("    Fragment: {}", f);
                    }
                }
                fragment = frag.length > 0 ? frag[0].toString() : "";
            }
            if (header != null) {
                TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
                TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
                header = frgTitle.length > 0 ? frgTitle[0].toString() : "";
            } else {
                header = "";
            }
            // ImmutableMap.of rejects null values; default "title" and text to ""
            // for documents that lack them (previously a latent NullPointerException).
            matchingParagraphs.add(ImmutableMap.of(
                // "id" is <noteId>/paragraph/<paragraphId>
                "id", path,
                "name", title != null ? title : "",
                "snippet", fragment,
                "text", text != null ? text : "",
                "header", header));
        }
    } catch (IOException | InvalidTokenOffsetsException e) {
        LOG.error("Exception on searching for {}", query, e);
    }
    return matchingParagraphs;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) TextFragment(org.apache.lucene.search.highlight.TextFragment) ScoreDoc(org.apache.lucene.search.ScoreDoc) InvalidTokenOffsetsException(org.apache.lucene.search.highlight.InvalidTokenOffsetsException) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 10 with TextFragment

use of org.apache.lucene.search.highlight.TextFragment in project jena by apache.

In class TextIndexLucene, the method highlightResults:

/**
 * Converts raw Lucene score docs into {@link TextHit}s whose literal text has
 * been run through a highlighter configured from the {@code highlight} spec.
 *
 * @param sDocs         score docs returned by the search
 * @param indexSearcher searcher used to load the stored documents
 * @param query         query used to score/highlight fragments
 * @param fields        candidate fields; the first one present in a document is highlighted
 * @param highlight     user-supplied highlight options string (parsed by HighlightOpts)
 * @param queryLang     optional language tag of the query; refines which analyzed field is tokenized
 * @return one TextHit per score doc, in input order
 * @throws IOException                  on index access failure
 * @throws InvalidTokenOffsetsException if token offsets exceed the text length during highlighting
 */
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, List<String> fields, String highlight, String queryLang) throws IOException, InvalidTokenOffsetsException {
    List<TextHit> results = new ArrayList<>();
    HighlightOpts opts = new HighlightOpts(highlight);
    // One highlighter is shared by all docs; fragment size comes from the options.
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));
    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        String entity = doc.get(docDef.getEntityField());
        Node literal = null;
        String field = getDocField(doc, fields);
        String lexical = doc.get(field);
        Collection<Node> props = docDef.getPredicates(field);
        // pick one - should be only one normally
        Node prop = props.isEmpty() ? null : props.iterator().next();
        String docLang = doc.get(docDef.getLangField());
        // Language-specific analyzed fields are stored as <field>_<lang>.
        String effectiveField = queryLang != null ? field + "_" + Util.getEffectiveLang(docLang, queryLang) : field;
        log.trace("highlightResults[{}]: {}, field: {}, lexical: {}, docLang: {}, effectiveField: {}", sd.doc, doc, field, lexical, docLang, effectiveField);
        if (lexical != null) {
            TokenStream tokenStream = indexAnalyzer.tokenStream(effectiveField, lexical);
            log.trace("tokenStream: {}", tokenStream.toString());
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags, opts.maxFrags);
            String rez = frags2string(frags, opts);
            log.trace("result: {}, #frags: {}", rez, frags.length);
            literal = NodeFactory.createLiteral(rez, docLang);
        }
        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;
        Node subject = TextQueryFuncs.stringToNode(entity);
        TextHit hit = new TextHit(subject, sd.score, literal, graph, prop);
        results.add(hit);
    }
    return results;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) Node(org.apache.jena.graph.Node) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) TextFragment(org.apache.lucene.search.highlight.TextFragment) ScoreDoc(org.apache.lucene.search.ScoreDoc) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) Highlighter(org.apache.lucene.search.highlight.Highlighter)

Aggregations

TextFragment (org.apache.lucene.search.highlight.TextFragment)10 TokenStream (org.apache.lucene.analysis.TokenStream)9 InvalidTokenOffsetsException (org.apache.lucene.search.highlight.InvalidTokenOffsetsException)7 IOException (java.io.IOException)5 ArrayList (java.util.ArrayList)5 Map (java.util.Map)5 Document (org.apache.lucene.document.Document)3 IndexableField (org.apache.lucene.index.IndexableField)3 ScoreDoc (org.apache.lucene.search.ScoreDoc)3 Highlighter (org.apache.lucene.search.highlight.Highlighter)3 ImmutableMap (com.google.common.collect.ImmutableMap)2 LinkedList (java.util.LinkedList)2 CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter)2 Fields (org.apache.lucene.index.Fields)2 ParseException (org.apache.lucene.queryparser.classic.ParseException)2 QueryNodeException (org.apache.lucene.queryparser.flexible.core.QueryNodeException)2 OffsetLimitTokenFilter (org.apache.lucene.search.highlight.OffsetLimitTokenFilter)2 QueryScorer (org.apache.lucene.search.highlight.QueryScorer)2 SimpleFragmenter (org.apache.lucene.search.highlight.SimpleFragmenter)2 SimpleHTMLFormatter (org.apache.lucene.search.highlight.SimpleHTMLFormatter)2