Search in sources :

Example 1 with Tokenization

use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization in project cogcomp-nlp by CogComp.

From the class BasicTextAnnotationBuilder, method tokenizeTextSpan.

/**
 * Builds a {@link Tokenization} for text that has already been split into sentences and tokens.
 *
 * Character offsets are synthesized rather than read from a real text: tokens within a sentence
 * are laid out one separator character apart, and every sentence (including one with no tokens)
 * is assumed to be followed by exactly one separator character. Callers that build the matching
 * text must use that same layout for the offsets to line up.
 *
 * @param tokenizedSentences the sentences, each given as an array of its tokens
 * @return the flattened tokens, their character offsets, and for each sentence the index one past
 *         its last token
 */
private static Tokenization tokenizeTextSpan(List<String[]> tokenizedSentences) {
    List<String> tokensList = new ArrayList<>();
    List<IntPair> charOffsetsList = new ArrayList<>();
    int[] sentenceEndIndexes = new int[tokenizedSentences.size()];
    int sentIndex = 0;
    int sentStartCharOffset = 0;
    for (String[] sentenceTokens : tokenizedSentences) {
        int tokenCharStart = sentStartCharOffset;
        for (String sentenceToken : sentenceTokens) {
            int tokenCharEnd = tokenCharStart + sentenceToken.length();
            tokensList.add(sentenceToken);
            charOffsetsList.add(new IntPair(tokenCharStart, tokenCharEnd));
            // The next token (or the next sentence) starts after a single separator character.
            tokenCharStart = tokenCharEnd + 1;
        }
        // Sentence end index is exclusive: the number of tokens collected so far.
        sentenceEndIndexes[sentIndex++] = tokensList.size();
        // A sentence without tokens still consumes its trailing separator; never move the
        // running offset backwards, otherwise later tokens would overlap earlier ones.
        sentStartCharOffset = sentenceTokens.length == 0 ? sentStartCharOffset + 1 : tokenCharStart;
    }
    assert tokensList.size() == charOffsetsList.size();
    String[] tokens = tokensList.toArray(new String[0]);
    IntPair[] charOffsets = charOffsetsList.toArray(new IntPair[0]);
    return new Tokenization(tokens, charOffsets, sentenceEndIndexes);
}
Also used : ArrayList(java.util.ArrayList) Tokenization(edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 2 with Tokenization

use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization in project cogcomp-nlp by CogComp.

From the class BasicTextAnnotationBuilder, method createTextAnnotationFromTokens.

/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 *
 * The text is rebuilt by joining the tokens of each sentence with a single space and ending
 * every sentence with a single newline character.
 *
 * @param corpusId identifier of the corpus, passed through to the {@link TextAnnotation}
 * @param textId identifier of the text, passed through to the {@link TextAnnotation}
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId, List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);
    StringBuilder text = new StringBuilder();
    for (String[] sentenceTokens : tokenizedSentences) {
        // tokenizeTextSpan advances exactly one character between sentences, so the sentence
        // terminator must be one character long. System.lineSeparator() is "\r\n" on Windows,
        // which would shift every character offset after the first sentence.
        text.append(StringUtils.join(sentenceTokens, ' ')).append('\n');
    }
    return new TextAnnotation(corpusId, textId, text.toString(), tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
}
Also used : Tokenization(edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Aggregations

Tokenization (edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization): 2, IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 1, TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1, ArrayList (java.util.ArrayList): 1