Search in sources :

Example 46 with BreakIterator

use of java.text.BreakIterator in project jdk8u_jdk by JetBrains.

the class GlyphView method getBreakSpot.

/**
     * Finds the break location to use for the region {@code (p0, p1]}.
     * <p>
     * The break spots of the whole view are computed lazily on first use and
     * cached in {@code breakSpots}, stored from the highest offset down to the
     * lowest. The first cached spot that is not beyond {@code p1} is the
     * candidate; it is returned only if it also lies after {@code p0}.
     *
     * @param p0 exclusive lower bound of the region
     * @param p1 inclusive upper bound of the region
     * @return the break location, or {@code BreakIterator.DONE} if the region
     *         contains no suitable break location
     */
private int getBreakSpot(int p0, int p1) {
    if (breakSpots == null) {
        // Lazily compute the break spots for the entire view.
        final int viewStart = getStartOffset();
        final int viewEnd = getEndOffset();
        final int[] found = new int[viewEnd + 1 - viewStart];

        // Run the breaker over the parent element's text: a valid break
        // may sit right at the trailing edge of this view (space, etc.).
        final Element parent = getElement().getParentElement();
        final int textStart;
        final int textEnd;
        if (parent != null) {
            textStart = parent.getStartOffset();
            textEnd = parent.getEndOffset();
        } else {
            textStart = viewStart;
            textEnd = viewEnd;
        }

        final Segment text = getText(textStart, textEnd);
        text.first();
        final BreakIterator breaker = getBreaker();
        breaker.setText(text);

        // Search backwards starting one past the view end, unless the
        // parent text does not extend beyond the view.
        int cursor = (textEnd > viewEnd) ? viewEnd + 1 : viewEnd;
        int count = 0;
        while (true) {
            // Translate document offset -> segment index, step back one
            // boundary, then translate back to a document offset.
            cursor = breaker.preceding(text.offset + (cursor - textStart))
                    + (textStart - text.offset);
            if (cursor <= viewStart) {
                // Walked out of the view; nothing more to collect.
                break;
            }
            found[count] = cursor;
            count++;
        }
        SegmentCache.releaseSharedSegment(text);

        breakSpots = new int[count];
        System.arraycopy(found, 0, breakSpots, 0, count);
    }

    // Spots are in descending order, so the first one <= p1 decides.
    for (int candidate : breakSpots) {
        if (candidate <= p1) {
            return (candidate > p0) ? candidate : BreakIterator.DONE;
        }
    }
    return BreakIterator.DONE;
}
Also used : BreakIterator(java.text.BreakIterator)

Example 47 with BreakIterator

use of java.text.BreakIterator in project cogcomp-nlp by CogComp.

the class ThaiTokenizer method tokenizeSentence.

/**
     * Given a sentence, return its tokens and their character offsets.
     * <p>
     * Token boundaries come from a word-instance {@link BreakIterator} created for the
     * {@code ("th", "TH", "TH")} locale. Segments that are empty after trimming
     * (i.e. whitespace between words) are skipped.
     *
     * @param text The sentence string
     * @return A {@link Pair} containing the array of tokens and, in the same order, the
     *         {@code IntPair(start, end)} of each token, where {@code start} is the index of
     *         the token's first character in {@code text} and {@code end} is the index just
     *         past its last character
     */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(text);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String sur = text.substring(start, end);
        if (sur.trim().isEmpty()) {
            // whitespace-only segment between words: not a token
            continue;
        }
        surfaces.add(sur);
        offsets.add(new IntPair(start, end));
    }
    String[] surfs = surfaces.toArray(new String[0]);
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    // Diamond instead of the raw Pair type keeps the construction type-checked.
    return new Pair<>(surfs, offs);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 48 with BreakIterator

use of java.text.BreakIterator in project cogcomp-nlp by CogComp.

the class ThaiTokenizer method tokenizeTextSpan.

/**
     * Given a span of text, return a {@link Tokenization} holding the ordered tokens, their
     * character offsets with respect to <b>the original text</b>, and the sentence-end
     * token indexes.
     * <p>
     * Token boundaries come from a word-instance {@link BreakIterator} created for the
     * {@code ("th", "TH", "TH")} locale; segments that are empty after trimming are skipped.
     * No sentence splitting is performed here: when at least one token is found, the single
     * sentence end recorded is the total token count; otherwise no sentence end is recorded.
     *
     * @param textSpan the text to tokenize
     * @return the tokens, their {@code IntPair(start, end)} offsets into {@code textSpan}
     *         (end exclusive), and the sentence-end indexes
     */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    List<IntPair> offsets = new ArrayList<>();
    List<String> surfaces = new ArrayList<>();
    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(textSpan);
    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String sur = textSpan.substring(start, end);
        if (sur.trim().isEmpty()) {
            // whitespace-only segment between words: not a token
            continue;
        }
        surfaces.add(sur);
        offsets.add(new IntPair(start, end));
    }
    String[] surfs = surfaces.toArray(new String[0]);
    IntPair[] offs = offsets.toArray(new IntPair[0]);
    // The whole span is one sentence, so the only sentence end (if any tokens exist)
    // is the token count.
    int[] ends = surfaces.isEmpty() ? new int[0] : new int[] {surfaces.size()};
    return new Tokenization(surfs, offs, ends);
}
Also used : Locale(java.util.Locale) ArrayList(java.util.ArrayList) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) BreakIterator(java.text.BreakIterator)

Example 49 with BreakIterator

use of java.text.BreakIterator in project geode by apache.

the class LogWriterImpl method formatText.

/**
 * Prints {@code target} to {@code writer} in chunks that end on whitespace.
 * <p>
 * Chunks are delimited by a line-instance {@link BreakIterator}; a candidate
 * break that does not follow a whitespace character is skipped in favour of the
 * next one. A chunk ending in {@code '\n'} (optionally preceded by
 * {@code '\r'}) is printed without that terminator, followed by a platform line
 * break and a two-space indent for the continuation. A final line break is
 * written unless the tracked line length is zero.
 *
 * @param writer destination for the formatted text
 * @param target text to format
 * @param initialLength number of characters already on the current line
 */
static void formatText(PrintWriter writer, String target, int initialLength) {
    final BreakIterator lineBreaks = BreakIterator.getLineInstance();
    lineBreaks.setText(target);

    int column = initialLength;
    int chunkStart = lineBreaks.first();
    for (int chunkEnd = lineBreaks.next();
            chunkEnd != BreakIterator.DONE;
            chunkEnd = lineBreaks.next()) {
        // Only whitespace breaks are acceptable: keep extending the chunk
        // until it ends in whitespace or the text is exhausted.
        char last = target.charAt(chunkEnd - 1);
        while (!Character.isWhitespace(last)) {
            final int candidate = lineBreaks.next();
            if (candidate == BreakIterator.DONE) {
                // End of the string: settle for the boundary we already have.
                break;
            }
            chunkEnd = candidate;
            last = target.charAt(chunkEnd - 1);
        }

        final boolean endsWithNewline = (last == '\n');
        int printEnd = chunkEnd;
        if (endsWithNewline) {
            // println supplies the line break, so drop the "\n" (and a "\r" before it).
            printEnd--;
            if (printEnd > 0 && target.charAt(printEnd - 1) == '\r') {
                printEnd--;
            }
        } else if (last == '\t') {
            // Count a tab as 8 columns: 7 here plus the 1 from the chunk length.
            column += 7;
        }

        final String chunk = target.substring(chunkStart, printEnd);
        writer.print(chunk);
        column += chunk.length();

        if (endsWithNewline || last == '\r') {
            // Forced end of line; continuation lines are indented two spaces.
            writer.println();
            writer.print("  ");
            column = 2;
        }
        chunkStart = chunkEnd;
    }

    if (column != 0) {
        writer.println();
    }
}
Also used : BreakIterator(java.text.BreakIterator)

Example 50 with BreakIterator

use of java.text.BreakIterator in project lucene-solr by apache.

the class BreakIteratorBoundaryScannerTest method testWordBoundary.

/**
 * Checks start- and end-offset lookup of a {@code BreakIteratorBoundaryScanner}
 * backed by a root-locale word {@link BreakIterator}, scanning from the
 * position of "formance" in {@code TEXT}.
 */
public void testWordBoundary() throws Exception {
    final StringBuilder buffer = new StringBuilder(TEXT);
    final BoundaryScanner wordScanner =
            new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
    final int from = TEXT.indexOf("formance");

    // Scanning backwards should land on the start of "high-performance".
    final int expectedStart = TEXT.indexOf("high-performance");
    testFindStartOffset(buffer, from, expectedStart, wordScanner);

    // Scanning forwards should land on the position of ", full".
    final int expectedEnd = TEXT.indexOf(", full");
    testFindEndOffset(buffer, from, expectedEnd, wordScanner);
}
Also used : BreakIterator(java.text.BreakIterator)

Aggregations

BreakIterator (java.text.BreakIterator)59 ArrayList (java.util.ArrayList)10 Locale (java.util.Locale)6 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)3 BytesRef (org.apache.lucene.util.BytesRef)3 Snippet (org.apache.lucene.search.highlight.Snippet)2 Intent (android.content.Intent)1 TagElement (com.google.devtools.j2objc.ast.TagElement)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)1 IOException (java.io.IOException)1 Iterator (java.util.Iterator)1 PriorityQueue (java.util.PriorityQueue)1 JComponent (javax.swing.JComponent)1 Text (org.apache.hadoop.io.Text)1 Analyzer (org.apache.lucene.analysis.Analyzer)1 IndexSearcher (org.apache.lucene.search.IndexSearcher)1 Encoder (org.apache.lucene.search.highlight.Encoder)1 CustomSeparatorBreakIterator (org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator)1 CustomPassageFormatter (org.apache.lucene.search.uhighlight.CustomPassageFormatter)1