use of java.text.BreakIterator in project jdk8u_jdk by JetBrains.
the class GlyphView method getBreakSpot.
/**
 * Looks up a break offset inside the requested region.
 *
 * <p>The break offsets for the whole view are computed on first use and
 * kept in {@code breakSpots}. The lookup walks that cache in storage order
 * and stops at the first entry that is not greater than {@code p1}; that
 * entry is the answer only if it is also greater than {@code p0}.
 *
 * @param p0 lower bound of the region (a returned offset is strictly greater)
 * @param p1 upper bound of the region (a returned offset is not greater)
 * @return a break offset in the region, or {@code BreakIterator.DONE} if
 *         there isn't a good location to break at in the region
 */
private int getBreakSpot(int p0, int p1) {
    if (breakSpots == null) {
        // First request: work out the break offsets for the entire view.
        final int viewStart = getStartOffset();
        final int viewEnd = getEndOffset();
        final int[] found = new int[viewEnd + 1 - viewStart];
        int count = 0;

        // The breaker is fed the parent element's text rather than just
        // this view's, because a valid break may sit right at the end edge
        // of the view (space, etc.).
        final Element parent = getElement().getParentElement();
        final int textStart;
        final int textEnd;
        if (parent == null) {
            textStart = viewStart;
            textEnd = viewEnd;
        } else {
            textStart = parent.getStartOffset();
            textEnd = parent.getEndOffset();
        }

        final Segment text = getText(textStart, textEnd);
        text.first();
        final BreakIterator breaker = getBreaker();
        breaker.setText(text);

        // Search backward, beginning one past the view end when the
        // parent text actually extends that far.
        int cursor = (textEnd > viewEnd) ? viewEnd + 1 : viewEnd;
        while (true) {
            // Offset difference between document positions and positions
            // inside the segment handed to the breaker.
            final int shift = textStart - text.offset;
            cursor = breaker.preceding(cursor - shift) + shift;
            if (cursor <= viewStart) {
                // Walked out of the view (or the breaker reported DONE).
                break;
            }
            found[count++] = cursor;
        }

        SegmentCache.releaseSharedSegment(text);
        breakSpots = new int[count];
        System.arraycopy(found, 0, breakSpots, 0, count);
    }

    for (int spot : breakSpots) {
        if (spot <= p1) {
            // Only the first entry not beyond p1 is ever considered.
            return (spot > p0) ? spot : BreakIterator.DONE;
        }
    }
    return BreakIterator.DONE;
}
use of java.text.BreakIterator in project cogcomp-nlp by CogComp.
the class ThaiTokenizer method tokenizeSentence.
/**
 * Given a sentence, return a set of tokens and their character offsets.
 *
 * <p>Word boundaries come from a word-instance {@link BreakIterator} created for the
 * {@code th_TH_TH} locale. Each span between two consecutive boundaries becomes a token,
 * except spans that are empty after {@link String#trim()}, which are skipped.
 *
 * @param text The sentence string
 * @return A {@link Pair} containing the array of tokens and their character offsets; each
 *         offset pair is (start inclusive, end exclusive) relative to {@code text}
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String text) {
    List<String> surfaces = new ArrayList<>();
    List<IntPair> offsets = new ArrayList<>();

    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(text);

    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String surface = text.substring(start, end);
        if (surface.trim().isEmpty()) {
            // Whitespace-only span between boundaries: not a token.
            continue;
        }
        surfaces.add(surface);
        offsets.add(new IntPair(start, end));
    }

    String[] tokens = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] spans = offsets.toArray(new IntPair[offsets.size()]);
    // Diamond instead of the raw Pair constructor, so the result is type-checked.
    return new Pair<>(tokens, spans);
}
use of java.text.BreakIterator in project cogcomp-nlp by CogComp.
the class ThaiTokenizer method tokenizeTextSpan.
/**
 * Given a span of text, return a {@code Tokenization} built from its tokens, their character
 * offsets with respect to <b>the original text</b>, and the sentence-end token indexes.
 *
 * <p>Word boundaries come from a word-instance {@link BreakIterator} created for the
 * {@code th_TH_TH} locale. Spans that are empty after {@link String#trim()} are skipped.
 * This method does no sentence splitting of its own: if at least one token was found, the
 * sentence-end array holds a single entry equal to the token count; otherwise it is empty.
 *
 * @param textSpan the text to tokenize
 * @return the tokens, their (start inclusive, end exclusive) offsets in {@code textSpan}, and
 *         the sentence-end indexes described above
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    List<String> surfaces = new ArrayList<>();
    List<IntPair> offsets = new ArrayList<>();

    BreakIterator boundary = BreakIterator.getWordInstance(new Locale("th", "TH", "TH"));
    boundary.setText(textSpan);

    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
        String surface = textSpan.substring(start, end);
        if (surface.trim().isEmpty()) {
            // Whitespace-only span between boundaries: not a token.
            continue;
        }
        surfaces.add(surface);
        offsets.add(new IntPair(start, end));
    }

    String[] tokens = surfaces.toArray(new String[surfaces.size()]);
    IntPair[] spans = offsets.toArray(new IntPair[offsets.size()]);
    // Nothing above records intermediate sentence ends, so the only possible
    // end marker is the one that closes the whole span.
    int[] sentenceEnds = surfaces.isEmpty() ? new int[0] : new int[] {surfaces.size()};
    return new Tokenization(tokens, spans, sentenceEnds);
}
use of java.text.BreakIterator in project geode by apache.
the class LogWriterImpl method formatText.
/**
 * Prints {@code target} to {@code writer} chunk by chunk, turning every embedded line
 * terminator into a {@code println()} followed by a continuation indent.
 *
 * <p>Chunks are delimited by a line-instance {@link BreakIterator}, but a boundary is only
 * accepted when the character before it is whitespace; otherwise the chunk is extended to
 * the next boundary (or to the end of the string).
 *
 * <p>A chunk ending in {@code '\n'}, {@code "\r\n"} or a lone {@code '\r'} has that
 * terminator removed before printing, since {@code println()} supplies the line break.
 *
 * @param writer destination for the formatted text
 * @param target the text to print
 * @param initialLength number of characters assumed to be on the current line already;
 *        a final {@code println()} is emitted unless the tracked line length ends up 0
 */
static void formatText(PrintWriter writer, String target, int initialLength) {
    BreakIterator boundary = BreakIterator.getLineInstance();
    boundary.setText(target);
    int start = boundary.first();
    int end = boundary.next();
    int lineLength = initialLength;

    while (end != BreakIterator.DONE) {
        // Look at the end and only accept whitespace breaks
        char endChar = target.charAt(end - 1);
        while (!Character.isWhitespace(endChar)) {
            int lastEnd = end;
            end = boundary.next();
            if (end == BreakIterator.DONE) {
                // give up. We are at the end of the string
                end = lastEnd;
                break;
            }
            endChar = target.charAt(end - 1);
        }

        int wordEnd = end;
        if (endChar == '\n') {
            // trim off the \n since println will do it for us
            wordEnd--;
            // Also drop the CR of a CRLF pair, never reaching back past this chunk's start.
            if (wordEnd > start && target.charAt(wordEnd - 1) == '\r') {
                wordEnd--;
            }
        } else if (endChar == '\r') {
            // A lone CR is treated as a line end below, so it must not be printed either.
            wordEnd--;
        } else if (endChar == '\t') {
            // figure tabs use 8 characters
            lineLength += 7;
        }

        String word = target.substring(start, wordEnd);
        lineLength += word.length();
        writer.print(word);

        if (endChar == '\n' || endChar == '\r') {
            // force end of line
            writer.println();
            // NOTE(review): the indent literal and the length recorded for it differ;
            // only "lineLength != 0" is ever tested, so output is unaffected. Confirm
            // which of the two was intended.
            writer.print(" ");
            lineLength = 2;
        }

        start = end;
        end = boundary.next();
    }

    if (lineLength != 0) {
        writer.println();
    }
}
use of java.text.BreakIterator in project lucene-solr by apache.
the class BreakIteratorBoundaryScannerTest method testWordBoundary.
/**
 * Checks start/end offset lookup of a word-based scanner (root locale) when probing from
 * the first occurrence of "formance" in {@code TEXT}: the start is expected at
 * "high-performance" and the end at ", full".
 */
public void testWordBoundary() throws Exception {
    final BoundaryScanner scanner =
        new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
    final StringBuilder buffer = new StringBuilder(TEXT);
    final int probe = TEXT.indexOf("formance");

    final int expectedStart = TEXT.indexOf("high-performance");
    testFindStartOffset(buffer, probe, expectedStart, scanner);

    final int expectedEnd = TEXT.indexOf(", full");
    testFindEndOffset(buffer, probe, expectedEnd, scanner);
}
Aggregations