use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class SentenceDetector method detectSentences.
/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace. <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          the labels passed to every {@link SentenceBoundary} annotation
 *          created by this call.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order. The implicit boundary at text end (see above) is annotated
 *         but is not part of this list.
 * @throws TalismaneException
 *           declared by the signature; propagated from the calls made while
 *           testing candidate boundaries.
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
    LOG.debug("detectSentences");

    // Candidate positions: every pattern match inside the analysis window that
    // is not covered by a "no sentence break" annotation.
    List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
    Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
    List<Integer> possibleBoundaries = new ArrayList<>();
    while (matcher.find()) {
        if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
            boolean noSentences = false;
            int position = matcher.start();
            for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
                if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
                    noSentences = true;
                    break;
                }
            }
            if (!noSentences) {
                possibleBoundaries.add(position);
            }
        }
    }

    // collect all deterministic sentence boundaries; the TreeSet keeps the
    // boundaries sorted and free of duplicates
    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
    Set<Integer> guessedBoundaries = new TreeSet<>(
        sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));

    // Share one token sequence for all possible boundaries, to avoid tokenising
    // multiple times
    Sentence sentence = new Sentence(text.getText(), sessionId);
    TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

    List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
    for (int possibleBoundary : possibleBoundaries) {
        PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Testing boundary: " + boundary);
            LOG.trace(" at position: " + possibleBoundary);
        }

        List<FeatureResult<?>> featureResults = new ArrayList<>();
        for (SentenceDetectorFeature<?> feature : features) {
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> featureResult = feature.check(boundary, env);
            if (featureResult != null) {
                featureResults.add(featureResult);
            }
        }
        if (LOG.isTraceEnabled()) {
            SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
            for (String featureResultString : featureResultSet) {
                LOG.trace(featureResultString);
            }
        }

        List<Decision> decisions = this.decisionMaker.decide(featureResults);
        if (LOG.isTraceEnabled()) {
            for (Decision decision : decisions) {
                LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
            }
        }

        // only the first decision in the returned list is consulted
        if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
            // the sentence ends immediately after the boundary character
            int boundaryEnd = possibleBoundary + 1;
            if (LOG.isTraceEnabled()) {
                // log the computed value: writing `"..." + possibleBoundary + 1`
                // would concatenate the digit "1" instead of adding it
                LOG.trace("Adding boundary: " + boundaryEnd);
            }
            guessedBoundaries.add(boundaryEnd);
            boundaries.add(boundary);
        }
    }

    if (LOG.isTraceEnabled()) {
        LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
        for (PossibleSentenceBoundary boundary : boundaries) {
            LOG.trace("boundary: " + boundary.toString());
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());
    }

    List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();

    // the first new sentence starts where the last existing one ended, or at 0
    int lastBoundary = 0;
    List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
    if (existingBoundaries.size() > 0) {
        lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
    }

    // advance boundary start until a non space character is encountered
    while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
        lastBoundary++;
    }

    // each guessed boundary beyond the current start closes one sentence
    for (int guessedBoundary : guessedBoundaries) {
        if (guessedBoundary > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added boundary: " + sentenceBoundary);
            }
            lastBoundary = guessedBoundary;
        }
    }

    // when the whole text is analysed, the text end closes the last sentence
    if (text.getAnalysisEnd() == text.getText().length()) {
        if (text.getAnalysisEnd() > lastBoundary) {
            Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
            newBoundaries.add(sentenceBoundary);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Added final boundary: " + sentenceBoundary);
            }
        }
    }

    text.addAnnotations(newBoundaries);
    return new ArrayList<>(guessedBoundaries);
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RawTextProcessor method processText.
/**
* Processes the current text based on annotations added to block 3, and
* returns a SentenceHolder.
* <p>
* The result is cached: if {@code this.sentenceHolder} is already set, it is
* returned immediately and nothing else happens. Otherwise the method walks
* over the {@link RawTextMarker} annotations whose start/end fall within the
* requested range, converts them into {@link RawTextInstruction}s, and applies
* these in position order to build the processed text, the sentence
* boundaries, the output segments and the processed-to-original index map.
* <p>
* Side effects visible in this method: processing marks are added to the
* annotations handled, and the instance fields {@code lineNumber},
* {@code leftoverNewline}, {@code leftoverOutput},
* {@code originalIndexProcessed}, {@code shouldProcessStack},
* {@code shouldOutputStack} and {@code sentenceHolder} are updated.
*
* @param textStartPos
*          position of the first character of {@code rawText}, expressed in
*          the same coordinates as the annotation start/end positions; it is
*          subtracted from annotation positions to index into {@code rawText}.
* @param textEndPos
*          upper bound (exclusive) for annotation positions to handle; an
*          annotation ending exactly here is only handled when
*          {@code finalBlock} is true and {@code rawText} is not empty.
* @param rawText
*          the raw text to process.
* @param finalBlock
*          whether this is the final block; passed on to the SentenceHolder.
* @return SentenceHolder to retrieve the sentences.
*/
protected final SentenceHolder processText(int textStartPos, int textEndPos, CharSequence rawText, boolean finalBlock) {
// return the cached holder if this text was already processed
if (this.sentenceHolder != null)
return this.sentenceHolder;
LOG.debug("processText");
List<Annotation<RawTextMarker>> annotations = this.getAnnotations(RawTextMarker.class);
if (LOG.isTraceEnabled()) {
LOG.trace("finalBlock? " + finalBlock);
LOG.trace("annotations: " + annotations.toString());
}
// Marks to apply, keyed by position. The TreeMap ensures positions are
// visited in ascending order below. In each pair, the boolean is true for
// the start of an annotation and false for its end.
Map<Integer, List<Pair<Boolean, Annotation<RawTextMarker>>>> markMap = new TreeMap<>();
for (Annotation<RawTextMarker> annotation : annotations) {
if (LOG.isTraceEnabled())
LOG.trace("Annotation: " + annotation.toString());
// START_MARK processing mark.
// An annotation start is handled at most once: only if it lies in
// [textStartPos, textEndPos) and has not yet been given the START_MARK.
if (annotation.getStart() >= textStartPos && annotation.getStart() < textEndPos && !annotation.hasProcessingMark(START_MARK)) {
if (LOG.isTraceEnabled()) {
LOG.trace("Start in range: textStartPos " + textStartPos + ">= matcherStart [[" + annotation.getStart() + "]] < textEndPos " + textEndPos + ", start applied? " + annotation.hasProcessingMark(START_MARK));
}
annotation.addProcessingMark(START_MARK);
List<Pair<Boolean, Annotation<RawTextMarker>>> startMarks = markMap.get(annotation.getStart());
if (startMarks == null) {
startMarks = new ArrayList<>();
markMap.put(annotation.getStart(), startMarks);
}
startMarks.add(new ImmutablePair<Boolean, Annotation<RawTextMarker>>(true, annotation));
} else {
if (LOG.isTraceEnabled()) {
LOG.trace("Start out of range: textStartPos " + textStartPos + ">= matcherStart [[" + annotation.getStart() + "]] < textEndPos " + textEndPos + ", start applied? " + annotation.hasProcessingMark(START_MARK));
}
}
// END_MARK processing mark.
// Same principle for the annotation end, except that an end located
// exactly at textEndPos is accepted for a non-empty final block.
if (annotation.getEnd() >= textStartPos && (annotation.getEnd() < textEndPos || (annotation.getEnd() == textEndPos && rawText.length() > 0 && finalBlock)) && !annotation.hasProcessingMark(END_MARK)) {
if (LOG.isTraceEnabled()) {
LOG.trace("End in range: textStartPos " + textStartPos + ">= matcherEnd [[" + annotation.getEnd() + "]] < textEndPos " + textEndPos + ", finalBlock? " + finalBlock + ", end applied? " + annotation.hasProcessingMark(END_MARK));
}
annotation.addProcessingMark(END_MARK);
List<Pair<Boolean, Annotation<RawTextMarker>>> endMarks = markMap.get(annotation.getEnd());
if (endMarks == null) {
endMarks = new ArrayList<>();
markMap.put(annotation.getEnd(), endMarks);
}
endMarks.add(new ImmutablePair<Boolean, Annotation<RawTextMarker>>(false, annotation));
} else {
if (LOG.isTraceEnabled()) {
LOG.trace("End out of range: textStartPos " + textStartPos + ">= matcherEnd [[" + annotation.getEnd() + "]] < textEndPos " + textEndPos + ", finalBlock? " + finalBlock + ", end applied? " + annotation.hasProcessingMark(END_MARK));
}
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("currentText: " + rawText.toString().replace('\n', '¶').replace('\r', '¶'));
LOG.trace("marks: " + markMap.toString());
}
SentenceHolder sentenceHolder = new SentenceHolder(originalIndexProcessed, finalBlock, sessionId);
// find any newlines
// The newline left over from the previous call is registered first, then
// every newline matched in rawText; lineNumber and leftoverNewline are
// instance state carried over to the next call.
sentenceHolder.addNewline(leftoverNewline, lineNumber - 1);
Matcher matcher = newlinePattern.matcher(rawText);
while (matcher.find()) {
sentenceHolder.addNewline(originalIndexProcessed + matcher.end(), lineNumber++);
leftoverNewline = originalIndexProcessed + matcher.end();
}
// Maps an index in processedText to the index in rawText from which the
// characters starting at that index were copied (or inserted).
Map<Integer, Integer> insertionPoints = new TreeMap<Integer, Integer>();
StringBuilder processedText = new StringBuilder();
// currentPos: index in rawText from which text still has to be copied to processedText
int currentPos = 0;
// outputPos: index in rawText from which text still has to be emitted as output text
int outputPos = 0;
for (int markPos : markMap.keySet()) {
List<Pair<Boolean, Annotation<RawTextMarker>>> marks = markMap.get(markPos);
// collect all instructions at a given position
List<Triple<RawTextInstruction, Annotation<RawTextMarker>, Boolean>> instructions = new ArrayList<>();
for (Pair<Boolean, Annotation<RawTextMarker>> mark : marks) {
boolean isStart = mark.getLeft();
Annotation<RawTextMarker> annotation = mark.getRight();
RawTextMarker marker = annotation.getData();
List<RawTextInstruction> actions = new ArrayList<>();
if (isStart) {
// start of annotation: translate the marker type into the
// instruction(s) to apply where the annotation begins
switch(marker.getType()) {
case SKIP:
actions.add(RawTextInstruction.PUSH_SKIP);
break;
case SENTENCE_BREAK:
break;
case SPACE:
actions.add(RawTextInstruction.INSERT);
actions.add(RawTextInstruction.PUSH_SKIP);
break;
case REPLACE:
actions.add(RawTextInstruction.INSERT);
actions.add(RawTextInstruction.PUSH_SKIP);
break;
case OUTPUT:
actions.add(RawTextInstruction.PUSH_OUTPUT);
actions.add(RawTextInstruction.PUSH_SKIP);
break;
case INCLUDE:
actions.add(RawTextInstruction.PUSH_INCLUDE);
break;
case OUTPUT_START:
actions.add(RawTextInstruction.PUSH_OUTPUT);
break;
case STOP:
actions.add(RawTextInstruction.STOP);
break;
case NO_SENTENCE_BREAK:
case NONE:
case OUTPUT_STOP:
case START:
case TAG:
break;
}
} else {
// end of annotation
switch(marker.getType()) {
case SKIP:
case SPACE:
case REPLACE:
actions.add(RawTextInstruction.POP_SKIP);
break;
case SENTENCE_BREAK:
actions.add(RawTextInstruction.SENTENCE_BREAK);
break;
case OUTPUT:
actions.add(RawTextInstruction.STOP_OUTPUT);
actions.add(RawTextInstruction.POP_SKIP);
break;
case INCLUDE:
actions.add(RawTextInstruction.POP_INCLUDE);
break;
case START:
actions.add(RawTextInstruction.START);
break;
case OUTPUT_STOP:
actions.add(RawTextInstruction.STOP_OUTPUT);
break;
case NO_SENTENCE_BREAK:
case NONE:
case OUTPUT_START:
case STOP:
case TAG:
break;
}
}
for (RawTextInstruction action : actions) {
instructions.add(new ImmutableTriple<>(action, annotation, isStart));
}
}
// sort the instructions to ensure they're applied in the correct
// order
// NOTE(review): the order is that of RawTextInstruction.compareTo -
// presumably the enum declaration order; confirm against the enum.
instructions = instructions.stream().sorted((i1, i2) -> i1.getLeft().compareTo(i2.getLeft())).collect(Collectors.toList());
for (Triple<RawTextInstruction, Annotation<RawTextMarker>, Boolean> triple : instructions) {
RawTextInstruction instruction = triple.getLeft();
Annotation<RawTextMarker> annotation = triple.getMiddle();
RawTextMarker marker = annotation.getData();
boolean isStart = triple.getRight();
int position = isStart ? annotation.getStart() : annotation.getEnd();
// index into rawText corresponding to the annotation position
int relativePosition = position - textStartPos;
if (LOG.isTraceEnabled()) {
LOG.trace((isStart ? "Start " : "Stop ") + marker.getType() + " at " + position + ", relative pos: " + relativePosition);
LOG.trace("instruction: " + instruction);
LOG.trace("Stack before: " + shouldProcessStack);
LOG.trace("Text before: " + processedText.toString());
LOG.trace("Added by filter: " + marker.getSource());
LOG.trace("Match text: " + this.getText().subSequence(annotation.getStart(), annotation.getEnd()).toString().replace('\n', '¶').replace('\r', '¶'));
}
// state in force before this instruction; re-read for every
// instruction since the previous one may have changed the stacks
boolean shouldProcess = shouldProcessStack.peek();
boolean shouldOutput = shouldOutputStack.peek();
switch(instruction) {
case PUSH_SKIP:
// flush pending text up to this position, then stop processing
if (shouldProcess) {
insertionPoints.put(processedText.length(), currentPos);
processedText.append(rawText.subSequence(currentPos, relativePosition));
if (shouldOutput) {
outputPos = relativePosition;
}
}
shouldProcessStack.push(false);
break;
case PUSH_OUTPUT:
if (!shouldOutput && !shouldProcess) {
outputPos = relativePosition;
}
shouldOutputStack.push(true);
break;
case PUSH_INCLUDE:
// processing resumes here: restart copying from this position and
// emit any text that was pending for output
if (!shouldProcess) {
currentPos = relativePosition;
if (shouldOutput) {
CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
this.addOutputText(sentenceHolder, processedText.length(), outputText);
outputPos = relativePosition;
}
}
shouldProcessStack.push(true);
break;
case SPACE:
// flush pending text, then add a single space unless the flushed
// text already ends with one (nothing is added if it is empty)
if (shouldProcess) {
insertionPoints.put(processedText.length(), currentPos);
CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
processedText.append(leftoverText);
currentPos = relativePosition;
if (leftoverText.length() > 0 && leftoverText.charAt(leftoverText.length() - 1) != ' ') {
insertionPoints.put(processedText.length(), currentPos);
processedText.append(" ");
}
}
break;
case INSERT:
// flush pending text, then append the marker's insertion text; every
// inserted character maps back to the same raw position
if (shouldProcess) {
insertionPoints.put(processedText.length(), currentPos);
CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
processedText.append(leftoverText);
currentPos = relativePosition;
for (int i = 0; i < marker.getInsertionText().length(); i++) {
insertionPoints.put(processedText.length() + i, currentPos);
}
if (LOG.isTraceEnabled())
LOG.trace("Inserting: " + marker.getInsertionText());
processedText.append(marker.getInsertionText());
}
break;
case SENTENCE_BREAK:
{
if (shouldProcess) {
insertionPoints.put(processedText.length(), currentPos);
CharSequence leftoverText = rawText.subSequence(currentPos, relativePosition);
processedText.append(leftoverText);
currentPos = relativePosition;
}
// add the sentence boundary after the last character that
// was added
sentenceHolder.addSentenceBoundary(processedText.length());
if (LOG.isTraceEnabled()) {
// trace only: show up to NUM_CHARS characters preceding the boundary
// character, with the boundary character itself in brackets
int boundary = processedText.length() - 1;
if (boundary >= 0) {
String string = null;
int start1 = boundary - NUM_CHARS;
if (start1 < 0)
start1 = 0;
String startString = processedText.subSequence(start1, boundary).toString();
String middleString = "" + processedText.charAt(boundary);
string = startString + "[" + middleString + "]";
string = string.replace('\n', '¶');
LOG.trace("Adding sentence break at position " + boundary + ": " + string);
}
}
// separate the following text from the sentence break with a space,
// unless the processed text already ends with one
if (shouldProcess) {
if (processedText.length() > 0 && processedText.charAt(processedText.length() - 1) != ' ') {
insertionPoints.put(processedText.length(), currentPos);
processedText.append(" ");
}
}
break;
}
case POP_SKIP:
case POP_INCLUDE:
case STOP:
case START:
{
// All four instructions change the top of the processing stack:
// POP_* remove it, STOP/START replace it by false/true. The state
// before and after is then compared to decide what to flush.
boolean wasProcessing = shouldProcess;
boolean wasOutputting = shouldOutput && !shouldProcess;
if (instruction == RawTextInstruction.POP_SKIP || instruction == RawTextInstruction.POP_INCLUDE) {
shouldProcessStack.pop();
} else if (instruction == RawTextInstruction.STOP) {
shouldProcessStack.pop();
shouldProcessStack.push(false);
} else if (instruction == RawTextInstruction.START) {
shouldProcessStack.pop();
shouldProcessStack.push(true);
}
shouldProcess = shouldProcessStack.peek();
shouldOutput = shouldOutput && !shouldProcess;
if (wasProcessing && !shouldProcess) {
// processing just stopped: flush the text pending up to here
insertionPoints.put(processedText.length(), currentPos);
processedText.append(rawText.subSequence(currentPos, relativePosition));
} else if (!wasProcessing && shouldProcess) {
// processing just resumed: copy from here onwards
currentPos = relativePosition;
}
if (wasOutputting && (!shouldOutput || !shouldProcess)) {
CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
this.addOutputText(sentenceHolder, processedText.length(), outputText);
outputPos = relativePosition;
} else if (!wasOutputting && (shouldOutput && !shouldProcess)) {
outputPos = relativePosition;
}
// shouldOutput?
break;
}
case POP_OUTPUT:
case STOP_OUTPUT:
case START_OUTPUT:
{
// Same principle as above, applied to the output stack.
boolean wasOutputting = shouldOutput && !shouldProcess;
if (instruction == RawTextInstruction.POP_OUTPUT) {
shouldOutputStack.pop();
} else if (instruction == RawTextInstruction.STOP_OUTPUT) {
shouldOutputStack.pop();
shouldOutputStack.push(false);
} else if (instruction == RawTextInstruction.START_OUTPUT) {
shouldOutputStack.pop();
shouldOutputStack.push(true);
}
shouldOutput = shouldOutputStack.peek();
if (wasOutputting && (!shouldOutput || !shouldProcess)) {
CharSequence outputText = rawText.subSequence(outputPos, relativePosition);
this.addOutputText(sentenceHolder, processedText.length(), outputText);
outputPos = relativePosition;
} else if (!wasOutputting && (shouldOutput && !shouldProcess)) {
outputPos = relativePosition;
}
// shouldOutput?
break;
}
}
if (LOG.isTraceEnabled()) {
LOG.trace("Stack after: " + shouldProcessStack);
LOG.trace("Text after: " + processedText.toString());
}
}
// next action
}
// next text marker
// All marks have been applied: flush whatever remains after the last mark,
// either into the processed text or into the leftover output.
boolean shouldProcess = shouldProcessStack.peek();
boolean shouldOutput = shouldOutputStack.peek();
if (shouldProcess) {
insertionPoints.put(processedText.length(), currentPos);
processedText.append(rawText.subSequence(currentPos, rawText.length()));
}
if (shouldOutput && !shouldProcess) {
leftoverOutput = leftoverOutput + rawText.subSequence(outputPos, rawText.length());
}
String finalProcessedText = processedText.toString();
if (LOG.isTraceEnabled())
LOG.trace("Text after processing: " + finalProcessedText);
sentenceHolder.setProcessedText(finalProcessedText);
// Register one original index per character of the processed text. Between
// two insertion points, the original index advances by one per character,
// starting from the raw position recorded for the earlier insertion point.
int lastIndex = 0;
int lastOriginalIndex = 0;
for (Entry<Integer, Integer> insertionPoint : insertionPoints.entrySet()) {
int j = 0;
for (int i = lastIndex; i < insertionPoint.getKey(); i++) {
sentenceHolder.addOriginalIndex(originalIndexProcessed + lastOriginalIndex + j);
j++;
}
lastIndex = insertionPoint.getKey();
lastOriginalIndex = insertionPoint.getValue();
}
// characters located after the last insertion point
if (lastIndex < sentenceHolder.getProcessedText().length()) {
int j = 0;
for (int i = lastIndex; i < sentenceHolder.getProcessedText().length(); i++) {
sentenceHolder.addOriginalIndex(originalIndexProcessed + lastOriginalIndex + j);
j++;
}
}
// advance the running offset by the length of the text just processed,
// and cache the result for subsequent calls
originalIndexProcessed += rawText.length();
this.sentenceHolder = sentenceHolder;
return sentenceHolder;
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RawTextProcessor method getDetectedSentences.
/**
 * Returns the sentences detected so far. Only complete sentences are
 * returned: if the last detected sentence is incomplete, it is removed from
 * the result and stored as the leftover, to be handed to the sentence holder
 * on the next call.
 * <p>
 * Each returned sentence gets an observer which copies any annotation added
 * to the sentence onto this raw text, with positions translated through the
 * sentence's original indexes.
 *
 * @return the complete sentences detected; possibly empty.
 */
public final List<Sentence> getDetectedSentences() {
    SentenceHolder prevHolder = this.getPreviousSentenceHolder();
    SentenceHolder currentHolder = this.getCurrentSentenceHolder();

    // boundaries are shifted left by the length of the previous holder's
    // processed text before being handed to the current holder
    for (Annotation<SentenceBoundary> boundaryAnnotation : sentenceBoundaries) {
        int shift = prevHolder.getProcessedText().length();
        currentHolder.addSentenceBoundary(boundaryAnnotation.getStart() - shift);
        currentHolder.addSentenceBoundary(boundaryAnnotation.getEnd() - shift);
    }

    List<Sentence> sentences = currentHolder.getDetectedSentences(leftover);
    leftover = null;

    // an incomplete trailing sentence is held back for the next round
    if (!sentences.isEmpty()) {
        int lastIndex = sentences.size() - 1;
        Sentence candidate = sentences.get(lastIndex);
        if (!candidate.isComplete()) {
            leftover = candidate;
            if (LOG.isTraceEnabled()) {
                LOG.trace("Set leftover to: " + leftover.toString());
            }
            sentences.remove(lastIndex);
        }
    }

    // ensure that sentence annotations get added to the raw text as well
    for (Sentence sentence : sentences) {
        AnnotationObserver rawTextForwarder = new AnnotationObserver() {
            int origin = RawTextProcessor.this.originalStartIndex;

            @Override
            public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
                List<Annotation<T>> translated = new ArrayList<>();
                for (Annotation<T> incoming : annotations) {
                    int rawStart = sentence.getOriginalIndex(incoming.getStart());
                    int rawEnd = sentence.getOriginalIndex(incoming.getEnd());
                    translated.add(incoming.getAnnotation(rawStart - origin, rawEnd - origin));
                }
                RawTextProcessor.this.addAnnotations(translated);
            }

            @Override
            public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
            }
        };
        sentence.addObserver(rawTextForwarder);
    }

    // nothing more to do when the current holder has no original text segments
    if (currentHolder.getOriginalTextSegments().isEmpty()) {
        return sentences;
    }

    // Otherwise the segments are appended to the leftover sentence's leftover
    // original text, creating an empty leftover sentence if there is none.
    if (leftover == null) {
        leftover = new Sentence("", currentFile, sessionId);
    }
    StringBuilder pendingSegments = new StringBuilder();
    // the output divider separates existing leftover text from the new segments
    if (leftover.getLeftoverOriginalText().length() > 0) {
        pendingSegments.append(TalismaneSession.get(sessionId).getOutputDivider());
    }
    for (String segment : currentHolder.getOriginalTextSegments().values()) {
        pendingSegments.append(segment);
    }
    leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + pendingSegments.toString());
    return sentences;
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class RollingTextBlock method getRawTextBlock.
/**
 * Get a raw text block for annotation by filters. The block's text is block 3
 * followed by block 4 of the current RollingTextBlock, with analysis starting
 * at 0 and ending at the end of block 3. It is assumed that annotations
 * crossing block 2 and 3 were already added by a predecessor.
 * <p>
 * Any annotation added to the returned block is also added to this
 * RollingTextBlock, with its start and end shifted right by the combined
 * length of blocks 1 and 2.
 *
 * @return the new annotated text covering blocks 3 and 4.
 */
public AnnotatedText getRawTextBlock() {
    String blockText = this.block3 + this.block4;
    AnnotatedText rawTextBlock = new AnnotatedText(blockText, 0, this.block3.length());

    AnnotationObserver forwarder = new AnnotationObserver() {
        @Override
        public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
            if (annotations.isEmpty()) {
                return;
            }

            // blocks 1 and 2 precede the raw text block in the rolling block
            int shift = RollingTextBlock.this.block1.length() + RollingTextBlock.this.block2.length();
            List<Annotation<T>> shifted = new ArrayList<>();
            for (Annotation<T> incoming : annotations) {
                shifted.add(incoming.getAnnotation(incoming.getStart() + shift, incoming.getEnd() + shift));
            }
            RollingTextBlock.this.addAnnotations(shifted);

            if (LOG.isTraceEnabled()) {
                LOG.trace("RawTextBlock Annotations received: " + annotations);
                LOG.trace("RawTextBlock Annotations added: " + shifted);
            }
        }

        @Override
        public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
        }
    };

    rawTextBlock.addObserver(forwarder);
    return rawTextBlock;
}
use of com.joliciel.talismane.Annotation in project talismane by joliciel-informatique.
the class Tokeniser method tokeniseWithDecisions.
/**
 * Similar to {@link #tokeniseWithDecisions(String)}, but the text to be
 * tokenised is contained within a Sentence object.
 * <p>
 * The token filters are applied to the token sequence inferred from each
 * resulting atomic sequence. For the first sequence only, one
 * {@link TokenBoundary} annotation per token is added to the sentence.
 *
 * @param sentence
 *          the sentence to tokenise
 * @param labels
 *          the labels to add to any annotations added.
 * @return the tokenised atomic token sequences, as returned by the internal
 *         tokenisation.
 * @throws TalismaneException
 *           declared by the signature; propagated from the underlying calls.
 * @throws IOException
 *           declared by the signature; propagated from the underlying calls.
 */
public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence, String... labels) throws TalismaneException, IOException {
    // Initially, separate the sentence into tokens using the separators
    // provided
    TokenSequence defaultTokens = new TokenSequence(sentence, this.sessionId);
    defaultTokens.findDefaultTokens();

    List<TokenisedAtomicTokenSequence> sequences = this.tokeniseInternal(defaultTokens, sentence);

    LOG.debug("####Final token sequences:");
    int sequenceNumber = 0;
    for (TokenisedAtomicTokenSequence atomicSequence : sequences) {
        sequenceNumber++;

        TokenSequence inferredSequence = atomicSequence.inferTokenSequence();
        for (TokenFilter filter : filters) {
            filter.apply(inferredSequence);
        }

        boolean isFirstSequence = (sequenceNumber == 1);
        if (isFirstSequence) {
            // add annotations for the very first token sequence
            List<Annotation<TokenBoundary>> tokenBoundaries = new ArrayList<>();
            for (Token token : inferredSequence) {
                TokenBoundary boundaryData = new TokenBoundary(token.getText(), token.getAnalyisText(), token.getAttributes());
                tokenBoundaries.add(new Annotation<>(token.getStartIndex(), token.getEndIndex(), boundaryData, labels));
            }
            sentence.addAnnotations(tokenBoundaries);
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Token sequence " + sequenceNumber);
            LOG.debug("Atomic sequence: " + atomicSequence);
            LOG.debug("Resulting sequence: " + inferredSequence);
        }
    }
    return sequences;
}
Aggregations