Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedChild in the cogcomp-nlp project by CogComp.
From the class IllinoisTokenizer, method tokenizeSentence.
/**
 * Splits one sentence into its tokens and reports where each token sits in the input.
 *
 * @param sentence the plain text sentence to tokenize
 * @return a pair of parallel arrays: the tokens in sentence order, and for each token its
 *         start and end character offsets (the end offset is one past the token's last
 *         character)
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    final LinkedVector words = new Sentence(sentence).wordSplit();
    final int tokenCount = words.size();

    final String[] tokens = new String[tokenCount];
    final IntPair[] spans = new IntPair[tokenCount];

    int index = 0;
    while (index < tokenCount) {
        final LinkedChild word = words.get(index);
        tokens[index] = word.toString();
        // Adding one to the child's end yields the one-past-the-end offset documented above.
        final int endExclusive = word.end + 1;
        spans[index] = new IntPair(word.start, endExclusive);
        index++;
    }

    return new Pair<>(tokens, spans);
}
Aggregations