Search in sources :

Example 6 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomUnifiedHighlighter method getFieldHighlighter.

/**
 * Assembles the {@link FieldHighlighter} for {@code field}.
 * <p>
 * The terms, highlight flags, phrase helper and automata computed for the field
 * are used to pick an offset source and an offset strategy. The field's break
 * iterator is wrapped in a {@code SplittingBreakIterator} keyed on
 * {@code UnifiedHighlighter.MULTIVAL_SEP_CHAR}, and everything is handed to a
 * {@code CustomFieldHighlighter}.
 *
 * @param field the name of the field to highlight
 * @param query the query passed on to the phrase-helper and automata lookups
 * @param allTerms the terms to filter down to those matching {@code field}
 * @param maxPassages forwarded unchanged to the created highlighter
 * @return a new {@code CustomFieldHighlighter} for {@code field}
 */
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
    final BytesRef[] fieldTerms = filterExtractedTerms(getFieldMatcher(field), allTerms);
    final Set<HighlightFlag> flags = getFlags(field);
    final PhraseHelper phrases = getPhraseHelper(field, query, flags);
    final CharacterRunAutomaton[] runAutomata = getAutomata(field, query, flags);
    final OffsetSource source = getOptimizedOffsetSource(field, fieldTerms, phrases, runAutomata);

    final BreakIterator delegate = getBreakIterator(field);
    final BreakIterator splitter = new SplittingBreakIterator(delegate, UnifiedHighlighter.MULTIVAL_SEP_CHAR);

    final FieldOffsetStrategy offsets = getOffsetStrategy(source, field, fieldTerms, phrases, runAutomata, flags);

    return new CustomFieldHighlighter(
        field,
        offsets,
        breakIteratorLocale,
        splitter,
        getScorer(field),
        maxPassages,
        // 1 when a no-match size is configured, 0 otherwise; presumably the number of
        // passages to emit when nothing matches - confirm against CustomFieldHighlighter.
        noMatchSize > 0 ? 1 : 0,
        getFormatter(field),
        noMatchSize,
        fieldValue);
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) BytesRef(org.apache.lucene.util.BytesRef) BreakIterator(java.text.BreakIterator)

Example 7 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomSeparatorBreakIteratorTests method testBreakOnCustomSeparator.

/**
 * Walks a {@code CustomSeparatorBreakIterator} forwards and backwards over a text
 * made of five segments joined by a random separator, then checks
 * {@code following}, {@code preceding} and {@code next(int)}.
 */
public void testBreakOnCustomSeparator() throws Exception {
    final Character sep = randomSeparator();
    final BreakIterator iterator = new CustomSeparatorBreakIterator(sep);

    // Each segment except the last carries its trailing separator.
    final String[] segments = { "this" + sep, "is" + sep, "the" + sep, "first" + sep, "sentence" };
    final String source = String.join("", segments);
    final String firstThree = segments[0] + segments[1] + segments[2];
    final String firstTwo = segments[0] + segments[1];

    iterator.setText(source);
    assertThat(iterator.current(), equalTo(0));
    assertThat(iterator.first(), equalTo(0));

    // Forward pass: every next() must land right after the following segment.
    for (String segment : segments) {
        int from = iterator.current();
        int to = iterator.next();
        assertThat(source.substring(from, to), equalTo(segment));
    }
    assertThat(iterator.next(), equalTo(BreakIterator.DONE));
    assertThat(iterator.last(), equalTo(source.length()));

    // Backward pass: every previous() must land at the start of the preceding segment.
    for (int idx = segments.length - 1; idx >= 0; idx--) {
        int to = iterator.current();
        int from = iterator.previous();
        assertThat(source.substring(from, to), equalTo(segments[idx]));
    }
    assertThat(iterator.previous(), equalTo(BreakIterator.DONE));
    assertThat(iterator.current(), equalTo(0));

    // Offset 9 lies inside the third segment.
    assertThat(source.substring(0, iterator.following(9)), equalTo(firstThree));
    assertThat(source.substring(0, iterator.preceding(9)), equalTo(firstTwo));

    assertThat(iterator.first(), equalTo(0));
    assertThat(source.substring(0, iterator.next(3)), equalTo(firstThree));
}
Also used : BreakIterator(java.text.BreakIterator)

Example 8 with BreakIterator

use of java.text.BreakIterator in project elasticsearch by elastic.

the class CustomSeparatorBreakIteratorTests method testSingleSentences.

/**
 * Compares a {@code CustomSeparatorBreakIterator} built with a random separator
 * against the JDK sentence iterator for {@link Locale#ROOT}, using
 * {@code assertSameBreaks} on a few very short inputs and the empty string.
 */
public void testSingleSentences() throws Exception {
    final BreakIterator reference = BreakIterator.getSentenceInstance(Locale.ROOT);
    final BreakIterator underTest = new CustomSeparatorBreakIterator(randomSeparator());
    final String[] inputs = { "a", "ab", "abc", "" };
    for (String text : inputs) {
        assertSameBreaks(text, reference, underTest);
    }
}
Also used : BreakIterator(java.text.BreakIterator)

Example 9 with BreakIterator

use of java.text.BreakIterator in project Openfire by igniterealtime.

the class StringUtils method wordWrap.

/**
 * Reformats a string so that a line break is introduced once a line has
 * reached {@code width} characters without containing a newline.
 * <p>
 * When a line is full, the break is placed as follows:
 * <ul>
 *   <li>if the character at the wrap position is whitespace, it is replaced
 *       by {@code '\n'};</li>
 *   <li>otherwise a {@code '\n'} is inserted at the last line-break boundary
 *       reported by a line-instance {@link BreakIterator} before the end of
 *       the line;</li>
 *   <li>if no such boundary exists, a {@code '\n'} is inserted at the wrap
 *       position itself.</li>
 * </ul>
 * <p>
 * Please note that this method can be lossy: a whitespace character at the
 * wrap position is replaced by the line break rather than kept.
 *
 * @param input the String to reformat; may be {@code null}.
 * @param width the maximum length of any one line.
 * @param locale the locale used to obtain the line {@link BreakIterator};
 *        when {@code null}, {@code JiveGlobals.getLocale()} is used.
 * @return the empty string if {@code input} is {@code null}; {@code input}
 *         itself if {@code width} is less than 5 or not smaller than the
 *         length of {@code input}; otherwise a new, re-wrapped String.
 */
public static String wordWrap(String input, int width, Locale locale) {
    // protect ourselves
    if (input == null) {
        return "";
    }
    if (width < 5 || width >= input.length()) {
        return input;
    }
    // default locale
    if (locale == null) {
        locale = JiveGlobals.getLocale();
    }
    StringBuilder buf = new StringBuilder(input);
    // Created on the first wrap and then reused: the locale never changes, so
    // only the text needs to be swapped for each line that is examined.
    BreakIterator breaks = null;
    boolean endOfLine = false;
    int lineStart = 0;
    for (int i = 0; i < buf.length(); i++) {
        if (buf.charAt(i) == '\n') {
            lineStart = i + 1;
            endOfLine = true;
        }
        // handle splitting at width character
        if (i > lineStart + width - 1) {
            if (!endOfLine) {
                int limit = i - lineStart - 1;
                if (breaks == null) {
                    breaks = BreakIterator.getLineInstance(locale);
                }
                breaks.setText(buf.substring(lineStart, i));
                int end = breaks.last();
                // The text handed to the iterator ends right before the wrap
                // position; unless the character there is whitespace, step back
                // to the previous line-break boundary.
                if (end == limit + 1) {
                    if (!Character.isWhitespace(buf.charAt(lineStart + end))) {
                        end = breaks.preceding(end - 1);
                    }
                }
                if (end != BreakIterator.DONE && end == limit + 1) {
                    // whitespace at the wrap position: replace it with a \n
                    buf.replace(lineStart + end, lineStart + end + 1, "\n");
                    lineStart = lineStart + end;
                } else if (end != BreakIterator.DONE && end != 0) {
                    // otherwise, just insert a \n at the boundary that was found
                    buf.insert(lineStart + end, '\n');
                    lineStart = lineStart + end + 1;
                } else {
                    // no usable boundary: hard break at the wrap position
                    buf.insert(i, '\n');
                    lineStart = i + 1;
                }
            } else {
                // NOTE(review): endOfLine is set by any '\n' seen earlier and is
                // only cleared here, so the first over-long line after a newline
                // is hard-broken without consulting the break iterator - confirm
                // that this is intended.
                buf.insert(i, '\n');
                lineStart = i + 1;
                endOfLine = false;
            }
        }
    }
    return buf.toString();
}
Also used : BreakIterator(java.text.BreakIterator)

Example 10 with BreakIterator

use of java.text.BreakIterator in project Openfire by igniterealtime.

the class StringUtils method toLowerCaseWordArray.

/**
 * Converts a line of text into an array of lower case words using a
 * word-instance {@link BreakIterator}.
 * <p>
 * Each segment between two word boundaries is trimmed, stripped of the
 * characters {@code + / \ # * ) ( &} and lower-cased; segments that end up
 * empty (whitespace, or made up only of the stripped characters) are dropped.
 * <p>
 * This method is under the Jive Open Source Software License and was
 * written by Mark Imbriaco.
 *
 * @param text a String of text to convert into an array of words
 * @return text broken up into an array of lower case words; an empty array
 *         if {@code text} is {@code null} or empty.
 */
public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.isEmpty()) {
        return new String[0];
    }
    List<String> wordList = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    boundary.setText(text);
    int start = 0;
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        // Remove characters that are not needed.
        String tmp = text.substring(start, end).trim()
                .replace("+", "")
                .replace("/", "")
                .replace("\\", "")
                .replace("#", "")
                .replace("*", "")
                .replace(")", "")
                .replace("(", "")
                .replace("&", "");
        if (!tmp.isEmpty()) {
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            wordList.add(tmp.toLowerCase(Locale.ROOT));
        }
    }
    return wordList.toArray(new String[0]);
}
Also used : ArrayList(java.util.ArrayList) BreakIterator(java.text.BreakIterator)

Aggregations

BreakIterator (java.text.BreakIterator): 59; ArrayList (java.util.ArrayList): 10; Locale (java.util.Locale): 6; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 3; BytesRef (org.apache.lucene.util.BytesRef): 3; Snippet (org.apache.lucene.search.highlight.Snippet): 2; Intent (android.content.Intent): 1; TagElement (com.google.devtools.j2objc.ast.TagElement): 1; Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 1; TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1; IOException (java.io.IOException): 1; Iterator (java.util.Iterator): 1; PriorityQueue (java.util.PriorityQueue): 1; JComponent (javax.swing.JComponent): 1; Text (org.apache.hadoop.io.Text): 1; Analyzer (org.apache.lucene.analysis.Analyzer): 1; IndexSearcher (org.apache.lucene.search.IndexSearcher): 1; Encoder (org.apache.lucene.search.highlight.Encoder): 1; CustomSeparatorBreakIterator (org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator): 1; CustomPassageFormatter (org.apache.lucene.search.uhighlight.CustomPassageFormatter): 1