Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization in the project cogcomp-nlp by CogComp.
From the class BasicTextAnnotationBuilder, method tokenizeTextSpan.
/**
 * Builds a {@link Tokenization} from pre-tokenized sentences, assigning each token the character
 * span it would occupy if the tokens of a sentence were joined by a single space and every
 * sentence were followed by a single separator character.
 *
 * @param tokenizedSentences A list of sentences, each one being an array of tokens
 * @return the flattened tokens, their character offsets, and for each sentence the (exclusive)
 *         index of its last token in the flattened token array
 */
private static Tokenization tokenizeTextSpan(List<String[]> tokenizedSentences) {
    List<String> tokensList = new ArrayList<>();
    List<IntPair> charOffsetsList = new ArrayList<>();
    int[] sentenceEndIndexes = new int[tokenizedSentences.size()];
    int sentIndex = 0;
    // Character position at which the next token starts; carried across sentences so that an
    // empty sentence can never rewind the offsets back to zero.
    int charCursor = 0;
    for (String[] sentenceTokens : tokenizedSentences) {
        for (String sentenceToken : sentenceTokens) {
            tokensList.add(sentenceToken);
            int tokenCharEnd = charCursor + sentenceToken.length();
            charOffsetsList.add(new IntPair(charCursor, tokenCharEnd));
            // One character follows every token: a space inside the sentence, or the
            // sentence separator after the last token.
            charCursor = tokenCharEnd + 1;
        }
        if (sentenceTokens.length == 0) {
            // An empty sentence still contributes its one-character separator to the text.
            charCursor++;
        }
        // Tokens and offsets are appended in lockstep, so the running token count is the
        // exclusive end index of this sentence.
        sentenceEndIndexes[sentIndex++] = tokensList.size();
    }
    String[] tokens = tokensList.toArray(new String[0]);
    IntPair[] charOffsets = charOffsetsList.toArray(new IntPair[0]);
    return new Tokenization(tokens, charOffsets, sentenceEndIndexes);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.Tokenizer.Tokenization in the project cogcomp-nlp by CogComp.
From the class BasicTextAnnotationBuilder, method createTextAnnotationFromTokens.
/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 * <p>
 * The text is reconstructed by joining the tokens of each sentence with a single space and
 * terminating every sentence with a single {@code '\n'}. A fixed one-character separator is
 * used (rather than the platform line separator) so that the text stays aligned with the
 * character offsets computed by {@code tokenizeTextSpan}, which assume exactly one character
 * between sentences.
 *
 * @param corpusId The corpus identifier passed through to the {@link TextAnnotation}
 * @param textId The text identifier passed through to the {@link TextAnnotation}
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId, List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);
    // StringBuilder avoids re-copying the accumulated text for every sentence.
    StringBuilder text = new StringBuilder();
    for (String[] sentenceTokens : tokenizedSentences) {
        text.append(StringUtils.join(sentenceTokens, ' ')).append('\n');
    }
    return new TextAnnotation(corpusId, textId, text.toString(), tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
}
Aggregations