Use of edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State in the project cogcomp-nlp by CogComp.
From the class StatefulTokenizer, method tokenizeSentence.
/**
 * Tokenizes a single sentence by running it through a fresh
 * {@link TokenizerStateMachine} and collecting every completed state that is
 * not an {@code IN_SENTENCE} state as a token.
 *
 * @param sentence the text to tokenize
 * @return a pair whose first element holds the token strings and whose second
 *         element holds, for each token, the {@code (start, end)} values taken
 *         from the corresponding state; both arrays have the same length and
 *         follow the order of the state machine's completed states
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    // Run the state machine over the input.
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(sentence);

    // First pass: count the token states so the result arrays can be sized exactly.
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) {
            continue; // sentence-level states contribute no token
        }
        tokenCount++;
    }

    // Second pass: materialize token text and offsets in encounter order.
    String[] tokens = new String[tokenCount];
    IntPair[] offsets = new IntPair[tokenCount];
    int next = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == TokenizerState.IN_SENTENCE.ordinal()) {
            continue;
        }
        // The token text is the slice [start, end) of the machine's text buffer.
        tokens[next] = new String(machine.text, state.start, state.end - state.start);
        offsets[next] = new IntPair(state.start, state.end);
        next++;
    }
    return new Pair<>(tokens, offsets);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State in the project cogcomp-nlp by CogComp.
From the class StatefulTokenizer, method tokenizeTextSpan.
/**
 * Tokenizes a span of text that may contain several sentences.
 *
 * <p>The text is run through a fresh {@link TokenizerStateMachine}. Each
 * completed state whose index equals {@code IN_SENTENCE} is treated as a
 * sentence entry; every other completed state is treated as a token. For each
 * sentence entry, the number of tokens collected so far is recorded as that
 * sentence's end index.
 *
 * <p>If there is more than one sentence and the last sentence ends at the same
 * token index as the one before it (i.e. it contains no tokens), that last
 * entry is dropped from the sentence-end array.
 *
 * @param textSpan the text to tokenize
 * @return a {@link Tokenization} built from the token strings, the per-token
 *         {@code (start, end)} values taken from the states, and the
 *         sentence-end token indices
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    // Run the state machine over the input.
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(textSpan);

    // First pass: count sentence entries and tokens to size the arrays exactly.
    int sentenceCount = 0;
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) {
            tokenCount++;
        } else {
            sentenceCount++;
        }
    }

    // Second pass: fill tokens/offsets and record where each sentence ends.
    String[] tokens = new String[tokenCount];
    IntPair[] offsets = new IntPair[tokenCount];
    int[] sentenceEnds = new int[sentenceCount];
    int nextToken = 0;
    int nextSentence = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) {
            // The token text is the slice [start, end) of the machine's text buffer.
            tokens[nextToken] = new String(machine.text, state.start, state.end - state.start);
            offsets[nextToken] = new IntPair(state.start, state.end);
            nextToken++;
        } else {
            // A sentence ends at the count of tokens seen so far.
            sentenceEnds[nextSentence] = nextToken;
            nextSentence++;
        }
    }

    // Drop a trailing sentence that contains no words: it ends at the same
    // token index as its predecessor.
    boolean trailingSentenceIsEmpty = sentenceCount > 1
            && sentenceEnds[sentenceCount - 1] == sentenceEnds[sentenceCount - 2];
    if (trailingSentenceIsEmpty) {
        int[] trimmed = new int[sentenceCount - 1];
        System.arraycopy(sentenceEnds, 0, trimmed, 0, trimmed.length);
        sentenceEnds = trimmed;
    }
    return new Tokenization(tokens, offsets, sentenceEnds);
}