Examples with BreakIterator - java.text.BreakIterator

Example 1 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class PostingsHighlighter method filterSnippets.

/**
 * Keeps only the highlighted snippets; when there are none, falls back to the first
 * snippet of the input (if any) so that highlighted and non highlighted snippets are
 * never mixed in the result.
 *
 * @param snippets          the candidate snippets, highlighted or not
 * @param numberOfFragments the requested number of fragments; when {@code 0} the fallback
 *                          snippet is cut down to its first sentence
 * @return a new list holding either all highlighted snippets, a single fallback snippet,
 *         or nothing when {@code snippets} is empty
 */
static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
    // Due to no_match_size the input may hold highlighted and non highlighted snippets.
    List<Snippet> result = new ArrayList<>(snippets.size());
    for (Snippet candidate : snippets) {
        if (candidate.isHighlighted()) {
            result.add(candidate);
        }
    }

    // Either we found highlighted snippets, or there is nothing to fall back on.
    if (!result.isEmpty() || snippets.isEmpty()) {
        return result;
    }

    // No highlighted snippet: fall back to the first non highlighted one.
    Snippet fallback = snippets.get(0);
    if (numberOfFragments == 0) {
        // Return only the first sentence of the content rather than the whole content.
        BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
        String content = fallback.getText();
        sentences.setText(content);
        int firstBoundary = sentences.next();
        if (firstBoundary != BreakIterator.DONE) {
            String firstSentence = content.substring(0, firstBoundary).trim();
            fallback = new Snippet(firstSentence, fallback.getScore(), fallback.isHighlighted());
        }
    }
    result.add(fallback);
    return result;
}

Also used : ArrayList(java.util.ArrayList) Snippet(org.apache.lucene.search.highlight.Snippet) CustomSeparatorBreakIterator(org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator) BreakIterator(java.text.BreakIterator)

Example 2 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class UnifiedHighlighter method highlight.

/**
 * Highlights a single field of the current hit and returns the resulting fragments.
 *
 * The field values are loaded, merged into one string using {@code MULTIVAL_SEP_CHAR},
 * and handed to a {@link CustomUnifiedHighlighter}. Snippets without text are dropped,
 * the remainder goes through {@code filterSnippets} and is optionally sorted by score.
 *
 * @param highlighterContext carries the field mapper, field options, search context,
 *                           hit context, field name and query used below
 * @return the highlighted field, or {@code null} when no fragment was produced
 * @throws FetchPhaseExecutionException if an {@link IOException} occurs while highlighting
 */
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    // make sure the hit-level cache holds a HighlighterEntry under CACHE_KEY
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        // no entry for this mapper yet: build a formatter from the first pre/post tag and the configured encoder
        // NOTE(review): the new entry is not stored back into highlighterEntry.mappers in this method,
        // so it looks like it is rebuilt on every call - confirm whether that is intended
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        // BytesRef values are converted to their display string, everything else is kept as loaded
        fieldValues = fieldValues.stream().map(obj -> {
            if (obj instanceof BytesRef) {
                return fieldMapper.fieldType().valueForDisplay(obj).toString();
            } else {
                return obj;
            }
        }).collect(Collectors.toList());
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        CustomUnifiedHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            // we use a control char to separate values, which is the only char that the custom break iterator
            // breaks the text on, so we don't lose the distinction between the different values of a field and we
            // get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator = new org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
            // we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
            BreakIterator bi = getBreakIterator(field);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize());
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        // with require_field_match only the highlighted field itself is accepted, otherwise any field name is
        if (field.fieldOptions().requireFieldMatch()) {
            final String fieldName = highlighterContext.fieldName;
            highlighter.setFieldMatcher((name) -> fieldName.equals(name));
        } else {
            highlighter.setFieldMatcher((name) -> true);
        }
        Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments);
        // keep only the snippets that actually contain text
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    // filtering uses the number of fragments requested in the field options, not the local numberOfFragments
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        //let's sort the snippets by score if needed
        // the comparator swaps its arguments, so higher scores come first
        CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    // nothing to highlight for this field
    return null;
}

Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) Analyzer(org.apache.lucene.analysis.Analyzer) BreakIterator(java.text.BreakIterator) Encoder(org.apache.lucene.search.highlight.Encoder) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) CustomPassageFormatter(org.apache.lucene.search.uhighlight.CustomPassageFormatter) BytesRef(org.apache.lucene.util.BytesRef) CustomUnifiedHighlighter(org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter) Snippet(org.apache.lucene.search.highlight.Snippet) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) FieldMapper(org.elasticsearch.index.mapper.FieldMapper)

Example 3 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomUnifiedHighlighter method getFieldHighlighter.

/**
 * Builds the {@link FieldHighlighter} for the given field, wrapping the field's break
 * iterator in a {@link SplittingBreakIterator} that is also given
 * {@code UnifiedHighlighter.MULTIVAL_SEP_CHAR}.
 *
 * @param field       name of the field to highlight
 * @param query       the query whose matches are highlighted
 * @param allTerms    the terms extracted from the query, filtered here by the field matcher
 * @param maxPassages the maximum number of passages to produce
 * @return a {@code CustomFieldHighlighter} configured for {@code field}
 */
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    BytesRef[] fieldTerms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    Set<HighlightFlag> flags = getFlags(field);
    PhraseHelper phrases = getPhraseHelper(field, query, flags);
    CharacterRunAutomaton[] runAutomata = getAutomata(field, query, flags);
    OffsetSource offsets = getOptimizedOffsetSource(field, fieldTerms, phrases, runAutomata);

    // Wrap the configured iterator together with the multi-value separator char.
    BreakIterator delegate = getBreakIterator(field);
    BreakIterator splitting = new SplittingBreakIterator(delegate, UnifiedHighlighter.MULTIVAL_SEP_CHAR);

    FieldOffsetStrategy offsetStrategy = getOffsetStrategy(offsets, field, fieldTerms, phrases, runAutomata, flags);
    return new CustomFieldHighlighter(
        field,
        offsetStrategy,
        breakIteratorLocale,
        splitting,
        getScorer(field),
        maxPassages,
        // 1 when a no-match size is configured, 0 otherwise
        (noMatchSize > 0 ? 1 : 0),
        getFormatter(field),
        noMatchSize,
        fieldValue);
}

Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) BytesRef(org.apache.lucene.util.BytesRef) BreakIterator(java.text.BreakIterator)

Example 4 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomSeparatorBreakIteratorTests method testBreakOnCustomSeparator.

/**
 * Checks that {@code CustomSeparatorBreakIterator} breaks only after the separator character,
 * walking the text forwards, then backwards, and finally via
 * {@code following}, {@code preceding} and {@code next(int)}.
 */
public void testBreakOnCustomSeparator() throws Exception {
    Character separator = randomSeparator();
    BreakIterator bi = new CustomSeparatorBreakIterator(separator);
    // five words joined by the separator: this<sep>is<sep>the<sep>first<sep>sentence
    String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
    bi.setText(source);
    assertThat(bi.current(), equalTo(0));
    assertThat(bi.first(), equalTo(0));
    // forward iteration: each segment ends right after its separator, the last one at the end of the text
    assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
    assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
    assertThat(bi.next(), equalTo(BreakIterator.DONE));
    assertThat(bi.last(), equalTo(source.length()));
    // backward iteration: current() is captured first because previous() moves the iterator
    int current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
    current = bi.current();
    assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
    assertThat(bi.previous(), equalTo(BreakIterator.DONE));
    assertThat(bi.current(), equalTo(0));
    // offset 9 lies inside "the": following() gives the boundary after it, preceding() the one before it
    assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
    assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
    assertThat(bi.first(), equalTo(0));
    // next(3) from the start advances three boundaries at once
    assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
}

Also used : BreakIterator(java.text.BreakIterator)

Example 5 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomSeparatorBreakIteratorTests method testSingleSentences.

/**
 * Compares {@code CustomSeparatorBreakIterator} against the JDK sentence iterator on short
 * inputs, including the empty string, via {@code assertSameBreaks}.
 */
public void testSingleSentences() throws Exception {
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
    BreakIterator separatorIterator = new CustomSeparatorBreakIterator(randomSeparator());
    String[] inputs = { "a", "ab", "abc", "" };
    for (String input : inputs) {
        assertSameBreaks(input, sentenceIterator, separatorIterator);
    }
}

Also used : BreakIterator(java.text.BreakIterator)

Aggregations

BreakIterator (java.text.BreakIterator) 112, ArrayList (java.util.ArrayList) 17, Locale (java.util.Locale) 9, IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair) 3, BytesRef (org.apache.lucene.util.BytesRef) 3, Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) 2, Collection (java.util.Collection) 2, BadLocationException (javax.swing.text.BadLocationException) 2, Document (javax.swing.text.Document) 2, Element (javax.swing.text.Element) 2, Segment (javax.swing.text.Segment) 2, Snippet (org.apache.lucene.search.highlight.Snippet) 2, StyleSpansBuilder (org.fxmisc.richtext.model.StyleSpansBuilder) 2, Intent (android.content.Intent) 1, Paint (android.graphics.Paint) 1, RectF (android.graphics.RectF) 1, TextPaint (android.text.TextPaint) 1, TagElement (com.google.devtools.j2objc.ast.TagElement) 1, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) 1, AbstractNLPDecoder (edu.emory.mathcs.nlp.decode.AbstractNLPDecoder) 1