Search in sources :

Example 1 with State

use of edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State in project cogcomp-nlp by CogComp.

the class StatefulTokenizer method tokenizeSentence.

/**
 * Tokenizes one sentence by running a {@link TokenizerStateMachine} over it.
 *
 * <p>Every completed state whose index is not that of
 * {@code TokenizerState.IN_SENTENCE} is emitted as a token. For each such state
 * the token string is the slice of the machine's text starting at the state's
 * {@code start} and having length {@code end - start}, and the matching offset
 * is {@code IntPair(start, end)}.
 *
 * @param sentence the text to tokenize
 * @return a pair of parallel arrays: the token strings and their offsets
 */
@Override
public Pair<String[], IntPair[]> tokenizeSentence(String sentence) {
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(sentence);

    final int sentenceState = TokenizerState.IN_SENTENCE.ordinal();

    // First pass: count the token-producing states so the arrays can be sized exactly.
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() != sentenceState) {
            tokenCount++;
        }
    }

    String[] tokens = new String[tokenCount];
    IntPair[] wordOffsets = new IntPair[tokenCount];

    // Second pass: fill both arrays in step, skipping sentence-level states.
    int next = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            continue;
        }
        tokens[next] = new String(machine.text, state.start, state.end - state.start);
        wordOffsets[next] = new IntPair(state.start, state.end);
        next++;
    }
    return new Pair<>(tokens, wordOffsets);
}
Also used : State(edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 2 with State

use of edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State in project cogcomp-nlp by CogComp.

the class StatefulTokenizer method tokenizeTextSpan.

/**
 * Tokenizes a span of text by running a {@link TokenizerStateMachine} over it
 * and packaging the result as a {@link Tokenization}.
 *
 * <p>Completed states whose index equals that of
 * {@code TokenizerState.IN_SENTENCE} are treated as sentence markers; all other
 * completed states become tokens. Each token string is the slice of the
 * machine's text starting at the state's {@code start} with length
 * {@code end - start}, and its offset is {@code IntPair(start, end)}. Each
 * sentence marker records the number of tokens emitted before it was reached.
 *
 * @param textSpan the text to tokenize
 * @return the tokens, their offsets, and the recorded sentence boundaries
 */
@Override
public Tokenization tokenizeTextSpan(String textSpan) {
    TokenizerStateMachine machine = new TokenizerStateMachine(splitOnDash);
    machine.parseText(textSpan);

    final int sentenceState = TokenizerState.IN_SENTENCE.ordinal();

    // First pass: count sentence markers and tokens so the arrays can be sized exactly.
    int sentenceCount = 0;
    int tokenCount = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            sentenceCount++;
        } else {
            tokenCount++;
        }
    }

    String[] tokens = new String[tokenCount];
    IntPair[] wordOffsets = new IntPair[tokenCount];
    int[] sentenceEnds = new int[sentenceCount];

    // Second pass: fill the arrays; a sentence marker stores the current token count.
    int nextSentence = 0;
    int nextToken = 0;
    for (State state : machine.completed) {
        if (state.stateIndex() == sentenceState) {
            sentenceEnds[nextSentence++] = nextToken;
            continue;
        }
        tokens[nextToken] = new String(machine.text, state.start, state.end - state.start);
        wordOffsets[nextToken] = new IntPair(state.start, state.end);
        nextToken++;
    }

    // If the last two boundaries are equal, no tokens lie between them, so the
    // final (empty) sentence entry is dropped.
    if (sentenceCount > 1
            && sentenceEnds[sentenceCount - 1] == sentenceEnds[sentenceCount - 2]) {
        int[] trimmed = new int[sentenceCount - 1];
        System.arraycopy(sentenceEnds, 0, trimmed, 0, trimmed.length);
        sentenceEnds = trimmed;
    }
    return new Tokenization(tokens, wordOffsets, sentenceEnds);
}
Also used : State(edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 2, State (edu.illinois.cs.cogcomp.nlp.tokenizer.TokenizerStateMachine.State): 2, Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 1