Example usage of com.joliciel.talismane.tokeniser.filters.TokenFilter in the project talismane by joliciel-informatique.
From the class Tokeniser, method tokeniseWithDecisions:
/**
 * Similar to {@link #tokeniseWithDecisions(String)}, but the text to be
 * tokenised is contained within a Sentence object.
 *
 * @param sentence
 *            the sentence to tokenise
 * @param labels
 *            the labels to add to any annotations added.
 * @return the tokenised atomic token sequences produced by the tokeniser for
 *         this sentence
 * @throws TalismaneException
 * @throws IOException
 */
public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence, String... labels) throws TalismaneException, IOException {
    // Initially, separate the sentence into tokens using the separators
    // provided
    TokenSequence tokenSequence = new TokenSequence(sentence, this.sessionId);
    tokenSequence.findDefaultTokens();

    List<TokenisedAtomicTokenSequence> sequences = this.tokeniseInternal(tokenSequence, sentence);

    // Guarded for consistency with the per-sequence debug output below.
    if (LOG.isDebugEnabled()) {
        LOG.debug("####Final token sequences:");
    }

    int j = 1;
    for (TokenisedAtomicTokenSequence sequence : sequences) {
        TokenSequence newTokenSequence = sequence.inferTokenSequence();

        // Apply every registered token filter to the inferred sequence.
        for (TokenFilter filter : filters) {
            filter.apply(newTokenSequence);
        }

        if (j == 1) {
            // add annotations for the very first token sequence
            List<Annotation<TokenBoundary>> tokenBoundaries = new ArrayList<>();
            for (Token token : newTokenSequence) {
                Annotation<TokenBoundary> tokenBoundary = new Annotation<>(token.getStartIndex(), token.getEndIndex(),
                        new TokenBoundary(token.getText(), token.getAnalyisText(), token.getAttributes()), labels);
                tokenBoundaries.add(tokenBoundary);
            }
            sentence.addAnnotations(tokenBoundaries);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Token sequence " + j);
            LOG.debug("Atomic sequence: " + sequence);
            LOG.debug("Resulting sequence: " + newTokenSequence);
        }
        j++;
    }

    return sequences;
}
Example usage of com.joliciel.talismane.tokeniser.filters.TokenFilter in the project talismane by joliciel-informatique.
From the class TokenRegexBasedCorpusReader, method processSentence:
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
    try {
        // Let the superclass run its own per-sentence processing first.
        super.processSentence(sentence, corpusLines);

        // Build a pre-tokenised sequence: one token per corpus line.
        tokenSequence = new PretokenisedSequence(sentence, sessionId);
        for (CorpusLine line : corpusLines) {
            this.convertToToken(tokenSequence, line);
        }

        // Run every registered token filter over the freshly built sequence.
        for (TokenFilter tokenFilter : filters) {
            tokenFilter.apply(tokenSequence);
        }

        tokenSequence.cleanSlate();
    } catch (TalismaneException e) {
        // Reset the reader's current sentence state before propagating the error.
        this.clearSentence();
        throw e;
    }
}
Aggregations