Search in sources :

Example 1 with Formatter

use of org.apache.lucene.search.highlight.Formatter in project elasticsearch by elastic.

the class PlainHighlighter method highlight.

@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper mapper = highlighterContext.mapper;
    Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked") Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);
    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query, field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]);
        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        cache.put(mapper, entry);
    }
    // a HACK to make highlighter do highlighting, even though its using the single frag list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
    ArrayList<TextFragment> fragsList = new ArrayList<>();
    List<Object> textsToHighlight;
    Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
        for (Object textToHighlight : textsToHighlight) {
            String text;
            if (textToHighlight instanceof BytesRef) {
                text = mapper.fieldType().valueForDisplay(textToHighlight).toString();
            } else {
                text = textToHighlight.toString();
            }
            try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                    // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                    continue;
                }
                TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
                for (TextFragment bestTextFragment : bestTextFragments) {
                    if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                        fragsList.add(bestTextFragment);
                    }
                }
            }
        }
    } catch (Exception e) {
        if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
            // the plain highlighter will parse the source and try to analyze it.
            return null;
        } else {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
    }
    if (field.fieldOptions().scoreOrdered()) {
        CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {

            @Override
            public int compare(TextFragment o1, TextFragment o2) {
                return Math.round(o2.getScore() - o1.getScore());
            }
        });
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multivalued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents);
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) });
        }
    }
    return null;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) Formatter(org.apache.lucene.search.highlight.Formatter) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) TextFragment(org.apache.lucene.search.highlight.TextFragment) Analyzer(org.apache.lucene.analysis.Analyzer) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) Encoder(org.apache.lucene.search.highlight.Encoder) SimpleFragmenter(org.apache.lucene.search.highlight.SimpleFragmenter) Fragmenter(org.apache.lucene.search.highlight.Fragmenter) SimpleSpanFragmenter(org.apache.lucene.search.highlight.SimpleSpanFragmenter) NullFragmenter(org.apache.lucene.search.highlight.NullFragmenter) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) BytesRefHash(org.apache.lucene.util.BytesRefHash) BytesRef(org.apache.lucene.util.BytesRef) SimpleSpanFragmenter(org.apache.lucene.search.highlight.SimpleSpanFragmenter) QueryScorer(org.apache.lucene.search.highlight.QueryScorer) Text(org.elasticsearch.common.text.Text) NullFragmenter(org.apache.lucene.search.highlight.NullFragmenter) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) SimpleHTMLFormatter(org.apache.lucene.search.highlight.SimpleHTMLFormatter) FieldMapper(org.elasticsearch.index.mapper.FieldMapper) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with Formatter

use of org.apache.lucene.search.highlight.Formatter in project epadd by ePADD.

the class Highlighter method highlight.

private static String highlight(String content, String term, String preTag, String postTag) throws IOException, ParseException, InvalidTokenOffsetsException {
    // The Lucene Highlighter is used in a hacky way here, it is intended to be used to retrieve fragments from a matching Lucene document.
    // The Lucene Highlighter introduces tags around every token that matched the query, hence it is required to merge these fragmented annotations into one inorder to fit our needs.
    // To truly differentiate contiguous fragments that match a term supplied we add a unique id to the pretag, hence the randum instance
    // TODO: Explain what is happening here
    // Version lv = Indexer.LUCENE_VERSION;
    // hell with reset close, stuff. initialized two analyzers to evade the problem.
    // TODO: get rid of two analyzers.
    Analyzer snAnalyzer, snAnalyzer2;
    snAnalyzer = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    snAnalyzer2 = new EnglishNumberAnalyzer(CharArraySet.EMPTY_SET);
    Fragmenter fragmenter = new NullFragmenter();
    QueryParser qp = new MultiFieldQueryParser(new String[] { "" }, snAnalyzer2);
    BooleanQuery.Builder querybuilder = new BooleanQuery.Builder();
    TokenStream stream = snAnalyzer.tokenStream(null, new StringReader(content));
    int r = randnum.nextInt();
    String upreTag = preTag.replaceAll(">$", " data-ignore=" + r + " >");
    Formatter formatter = new SimpleHTMLFormatter(upreTag, postTag);
    // Parse exception may occur while parsing terms like "AND", "OR" etc.
    try {
        querybuilder.add(new BooleanClause(qp.parse(term), BooleanClause.Occur.SHOULD));
    } catch (ParseException pe) {
        if (log.isDebugEnabled())
            log.debug("Exception while parsing: " + term, pe);
        return content;
    }
    Scorer scorer = new QueryScorer(querybuilder.build());
    org.apache.lucene.search.highlight.Highlighter highlighter = new org.apache.lucene.search.highlight.Highlighter(formatter, scorer);
    highlighter.setTextFragmenter(fragmenter);
    highlighter.setMaxDocCharsToAnalyze(Math.max(org.apache.lucene.search.highlight.Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE, content.length()));
    String result = highlighter.getBestFragment(stream, content);
    snAnalyzer.close();
    snAnalyzer2.close();
    if (result != null) {
        result = mergeContiguousFragments(result, term, upreTag, postTag);
        // and then remove the extra info. we appended to the tags
        result = result.replaceAll(" data-ignore=" + r + " >", ">");
        return result;
    } else
        return content;
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TokenStream(org.apache.lucene.analysis.TokenStream) Formatter(org.apache.lucene.search.highlight.Formatter) Analyzer(org.apache.lucene.analysis.Analyzer) org.apache.lucene.search.highlight(org.apache.lucene.search.highlight) StringReader(java.io.StringReader) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) BooleanClause(org.apache.lucene.search.BooleanClause) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 Formatter (org.apache.lucene.search.highlight.Formatter)2 IOException (java.io.IOException)1 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser)1 ParseException (org.apache.lucene.queryparser.classic.ParseException)1 QueryParser (org.apache.lucene.queryparser.classic.QueryParser)1 BooleanClause (org.apache.lucene.search.BooleanClause)1 BooleanQuery (org.apache.lucene.search.BooleanQuery)1 org.apache.lucene.search.highlight (org.apache.lucene.search.highlight)1 Encoder (org.apache.lucene.search.highlight.Encoder)1 Fragmenter (org.apache.lucene.search.highlight.Fragmenter)1 NullFragmenter (org.apache.lucene.search.highlight.NullFragmenter)1 QueryScorer (org.apache.lucene.search.highlight.QueryScorer)1 SimpleFragmenter (org.apache.lucene.search.highlight.SimpleFragmenter)1 SimpleHTMLFormatter (org.apache.lucene.search.highlight.SimpleHTMLFormatter)1