use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.
the class SentenceDetector method detectSentences.
/**
 * Detect sentences within an annotated text. Sentences are added in the form
 * of an Annotation around a {@link SentenceBoundary}, with the start position
 * (relative to the start of the annotated text) at the start of the sentence
 * and the end position immediately after the end of the sentence. <br>
 * <br>
 * Sentence boundaries will not be detected within any annotation of type
 * {@link RawTextNoSentenceBreakMarker}, nor will they be detected before or
 * after the {@link AnnotatedText#getAnalysisStart()} and
 * {@link AnnotatedText#getAnalysisEnd()} respectively. <br>
 * <br>
 * If the text contained existing {@link SentenceBoundary} annotations before
 * analysis start, the first sentence will begin where the last existing
 * annotation ended. Otherwise, the first boundary will begin at position 0.
 * In both cases the start is then advanced past any leading whitespace.
 * <br>
 * <br>
 * If the text's analysis end is equal to the text length, it is assumed that
 * the text end is a sentence boundary. In this case, an additional sentence
 * is added starting at the final detected boundary and ending at text end.
 *
 * @param text
 *          the annotated text in which we need to detect sentences.
 * @param labels
 *          labels passed to every {@link SentenceBoundary} annotation created
 *          by this call.
 * @return in addition to the annotations added, we return a List of integers
 *         marking the end position of each sentence boundary, in ascending
 *         order. This includes the end of every
 *         {@link RawTextSentenceBreakMarker} ending at or after analysis
 *         start, and the position immediately after each boundary character
 *         accepted by the decision maker.
 * @throws TalismaneException
 *           propagated from the calls made during detection (NOTE(review):
 *           confirm which of the feature check / decision maker declares it).
 */
public List<Integer> detectSentences(AnnotatedText text, String... labels) throws TalismaneException {
  LOG.debug("detectSentences");

  // Candidate positions: every pattern match inside the analysis window that
  // is not covered by a "no sentence break" marker.
  List<Annotation<RawTextNoSentenceBreakMarker>> noSentenceBreakMarkers = text.getAnnotations(RawTextNoSentenceBreakMarker.class);
  Matcher matcher = possibleBoundaryPattern.matcher(text.getText());
  List<Integer> possibleBoundaries = new ArrayList<>();
  while (matcher.find()) {
    if (matcher.start() >= text.getAnalysisStart() && matcher.start() < text.getAnalysisEnd()) {
      boolean noSentences = false;
      int position = matcher.start();
      for (Annotation<RawTextNoSentenceBreakMarker> noSentenceBreakMarker : noSentenceBreakMarkers) {
        if (noSentenceBreakMarker.getStart() <= position && position < noSentenceBreakMarker.getEnd()) {
          noSentences = true;
          break;
        }
      }
      if (!noSentences)
        possibleBoundaries.add(position);
    }
  }

  // collect all deterministic sentence boundaries; the TreeSet keeps the
  // positions unique and sorted
  List<Annotation<RawTextSentenceBreakMarker>> sentenceBreakMarkers = text.getAnnotations(RawTextSentenceBreakMarker.class);
  Set<Integer> guessedBoundaries = new TreeSet<>(
      sentenceBreakMarkers.stream().filter(f -> f.getEnd() >= text.getAnalysisStart()).map(f -> f.getEnd()).collect(Collectors.toList()));

  // Share one token sequence for all possible boundaries, to avoid tokenising
  // multiple times
  Sentence sentence = new Sentence(text.getText(), sessionId);
  TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);

  List<PossibleSentenceBoundary> boundaries = new ArrayList<>();
  for (int possibleBoundary : possibleBoundaries) {
    PossibleSentenceBoundary boundary = new PossibleSentenceBoundary(tokenSequence, possibleBoundary);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Testing boundary: " + boundary);
      LOG.trace(" at position: " + possibleBoundary);
    }

    List<FeatureResult<?>> featureResults = new ArrayList<>();
    for (SentenceDetectorFeature<?> feature : features) {
      RuntimeEnvironment env = new RuntimeEnvironment();
      FeatureResult<?> featureResult = feature.check(boundary, env);
      if (featureResult != null)
        featureResults.add(featureResult);
    }

    if (LOG.isTraceEnabled()) {
      SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
      for (String featureResultString : featureResultSet) {
        LOG.trace(featureResultString);
      }
    }

    List<Decision> decisions = this.decisionMaker.decide(featureResults);
    if (LOG.isTraceEnabled()) {
      for (Decision decision : decisions) {
        LOG.trace(decision.getOutcome() + ": " + decision.getProbability());
      }
    }

    // only the first decision in the returned list is consulted
    if (decisions.get(0).getOutcome().equals(SentenceDetectorOutcome.IS_BOUNDARY.name())) {
      // the sentence ends immediately AFTER the boundary character
      int sentenceEnd = possibleBoundary + 1;
      if (LOG.isTraceEnabled()) {
        // parenthesised sum: without it "+ 1" was string-concatenated and the
        // trace showed e.g. "51" for position 5 instead of "6"
        LOG.trace("Adding boundary: " + sentenceEnd);
      }
      guessedBoundaries.add(sentenceEnd);
      boundaries.add(boundary);
    }
  }

  if (LOG.isTraceEnabled()) {
    LOG.trace("context: " + text.getText().toString().replace('\n', '¶').replace('\r', '¶'));
    for (PossibleSentenceBoundary boundary : boundaries)
      LOG.trace("boundary: " + boundary.toString());
  }
  if (LOG.isDebugEnabled())
    LOG.debug("guessedBoundaries : " + guessedBoundaries.toString());

  // Turn the sorted end positions into [start, end) SentenceBoundary
  // annotations, chaining each sentence onto the end of the previous one.
  List<Annotation<SentenceBoundary>> newBoundaries = new ArrayList<>();
  int lastBoundary = 0;
  List<Annotation<SentenceBoundary>> existingBoundaries = text.getAnnotations(SentenceBoundary.class);
  if (existingBoundaries.size() > 0) {
    lastBoundary = existingBoundaries.get(existingBoundaries.size() - 1).getEnd();
  }

  // advance boundary start until a non space character is encountered
  while (lastBoundary < text.getAnalysisEnd() && Character.isWhitespace(text.getText().charAt(lastBoundary))) {
    lastBoundary++;
  }

  for (int guessedBoundary : guessedBoundaries) {
    // skip positions at or before the current start: they would give an
    // empty or negative-length sentence
    if (guessedBoundary > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, guessedBoundary, new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added boundary: " + sentenceBoundary);
      }
      lastBoundary = guessedBoundary;
    }
  }

  // when the analysis window reaches the end of the text, the text end itself
  // closes the final sentence
  if (text.getAnalysisEnd() == text.getText().length()) {
    if (text.getAnalysisEnd() > lastBoundary) {
      Annotation<SentenceBoundary> sentenceBoundary = new Annotation<>(lastBoundary, text.getAnalysisEnd(), new SentenceBoundary(), labels);
      newBoundaries.add(sentenceBoundary);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Added final boundary: " + sentenceBoundary);
      }
    }
  }

  text.addAnnotations(newBoundaries);
  return new ArrayList<>(guessedBoundaries);
}
use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.
the class SentenceDetectorEvaluator method evaluate.
/**
 * Evaluate a given sentence detector.
 * <br>
 * Sentences are read one at a time from the corpus reader. Each sentence is
 * analysed in a context made of the previous sentence, the sentence itself,
 * and enough following sentences to provide at least
 * {@code minCharactersAfterBoundary} characters of look-ahead (when the
 * corpus has that much text left). Every candidate boundary inside the
 * sentence, plus the real boundary at the sentence end, is scored as
 * expected-vs-guessed, both globally and per boundary character. <br>
 * <br>
 * All boundary positions handled here are <i>end</i> positions (the index
 * immediately after the boundary character), matching the positions returned
 * by the sentence detector's {@code detectSentences}. <br>
 * <br>
 * If an error writer is set, per-character statistics and the list of errors
 * are written to it, and it is closed before returning.
 *
 * @return an f-score calculator for this sentence detector
 * @throws TalismaneException
 *           propagated from the corpus reader or the sentence detector
 * @throws IOException
 *           propagated from the corpus reader or from writing to the error
 *           writer
 */
public FScoreCalculator<SentenceDetectorOutcome> evaluate() throws TalismaneException, IOException {
  FScoreCalculator<SentenceDetectorOutcome> fScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();

  // add f-score per tagger module, to see how we do for each boundary
  // character
  Map<String, FScoreCalculator<SentenceDetectorOutcome>> taggerFScoreCalculators = new TreeMap<String, FScoreCalculator<SentenceDetectorOutcome>>();
  Map<String, List<String>> errorMap = new TreeMap<String, List<String>>();

  // Sentences already read from the corpus but not yet evaluated. A
  // LinkedList is used because both queue access (poll) and look-ahead by
  // index (get) are needed.
  LinkedList<String> sentences = new LinkedList<String>();
  // initial left context for the very first sentence
  String previousSentence = ". ";

  // Only enqueue a first sentence if the corpus has one: enqueueing null for
  // an empty corpus would fail with a NullPointerException further down.
  if (corpusReader.hasNextSentence())
    sentences.add(corpusReader.nextSentence().getText().toString());

  while (!sentences.isEmpty()) {
    String sentence = sentences.poll();
    LOG.debug("Sentence: " + sentence);

    // gather look-ahead text, first from sentences already read, then from
    // the corpus reader
    StringBuilder moreText = new StringBuilder();
    int sentenceIndex = 0;
    while (moreText.length() < minCharactersAfterBoundary) {
      String nextSentence;
      if (sentenceIndex < sentences.size()) {
        nextSentence = sentences.get(sentenceIndex);
      } else if (corpusReader.hasNextSentence()) {
        nextSentence = corpusReader.nextSentence().getText().toString();
        sentences.add(nextSentence);
      } else {
        break;
      }
      // make sure consecutive sentences are separated by whitespace
      if (!(nextSentence.startsWith(" ") || nextSentence.startsWith("\n")))
        moreText.append(" ");
      moreText.append(nextSentence);
      sentenceIndex++;
    }

    String text = previousSentence + sentence + moreText;
    AnnotatedText annotatedText = new AnnotatedText(text, previousSentence.length(), previousSentence.length() + sentence.length(), new ArrayList<>());

    Matcher matcher = sentenceDetector.getPossibleBoundaryPattern().matcher(text);
    List<Integer> possibleBoundaries = new ArrayList<Integer>();
    while (matcher.find()) {
      if (matcher.start() >= annotatedText.getAnalysisStart() && matcher.start() < annotatedText.getAnalysisEnd()) {
        // Record the END position (just after the boundary character): this is
        // what detectSentences returns and what realBoundary and
        // charAt(possibleBoundary - 1) below assume. Recording matcher.start()
        // meant a boundary guessed in mid-sentence could never be matched
        // against its candidate.
        possibleBoundaries.add(matcher.start() + 1);
      }
    }

    // the sentence end is always evaluated, even if no boundary character
    // sits there
    int realBoundary = previousSentence.length() + sentence.length();
    if (!possibleBoundaries.contains(realBoundary))
      possibleBoundaries.add(realBoundary);

    List<Integer> guessedBoundaries = this.sentenceDetector.detectSentences(annotatedText);

    for (int possibleBoundary : possibleBoundaries) {
      SentenceDetectorOutcome expected = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
      SentenceDetectorOutcome guessed = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
      if (possibleBoundary == realBoundary)
        expected = SentenceDetectorOutcome.IS_BOUNDARY;
      if (guessedBoundaries.contains(possibleBoundary))
        guessed = SentenceDetectorOutcome.IS_BOUNDARY;
      fScoreCalculator.increment(expected, guessed);

      // group statistics by the character that ends the candidate sentence;
      // anything that is not a boundary character goes under "OTHER"
      String boundaryCharacter = "" + text.charAt(possibleBoundary - 1);
      Matcher boundaryMatcher = sentenceDetector.getPossibleBoundaryPattern().matcher(boundaryCharacter);
      if (!boundaryMatcher.matches())
        boundaryCharacter = "OTHER";

      FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.computeIfAbsent(boundaryCharacter,
          key -> new FScoreCalculator<SentenceDetectorOutcome>());
      taggerFScoreCalculator.increment(expected, guessed);

      if (!expected.equals(guessed)) {
        // build a context snippet of the form "before[c]after" for the log
        int start1 = possibleBoundary - NUM_CHARS;
        int end1 = possibleBoundary + NUM_CHARS;
        if (start1 < 0)
          start1 = 0;
        String startString = text.substring(start1, possibleBoundary - 1);
        startString = StringUtils.padLeft(startString, NUM_CHARS);

        String middleString = "" + text.charAt(possibleBoundary - 1);
        if (end1 >= text.length())
          end1 = text.length() - 1;
        String endString = "";
        if (end1 >= 0 && possibleBoundary < text.length())
          endString = text.substring(possibleBoundary, end1);

        String testText = startString + "[" + middleString + "]" + endString;
        testText = testText.replace('\n', '¶');
        String error = "Guessed " + guessed + ", Expected " + expected + ". Text: " + testText;
        LOG.debug(error);

        errorMap.computeIfAbsent(boundaryCharacter, key -> new ArrayList<String>()).add(error);
      } // have error
    } // next possible boundary

    // the sentence just evaluated becomes the left context of the next one
    if (sentence.endsWith(" "))
      previousSentence = sentence;
    else
      previousSentence = sentence + " ";
  }

  for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
    LOG.debug("###### Boundary " + boundaryCharacter);
    FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
    LOG.debug("###### Boundary " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore());
  }

  if (errorWriter != null) {
    try {
      for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
        FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
        errorWriter.write("###### Tagger " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore() + "\n");
        errorWriter.write("Total " + (taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount()) + "\n");
        errorWriter.write("True + " + taggerFScoreCalculator.getTotalTruePositiveCount() + "\n");
        errorWriter.write("False- " + taggerFScoreCalculator.getTotalFalseNegativeCount() + "\n");
        errorWriter.write("False+ " + taggerFScoreCalculator.getTotalFalsePositiveCount() + "\n");
        for (SentenceDetectorOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
          errorWriter.write(outcome + " total " + (taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
          errorWriter.write(outcome + " true + " + (taggerFScoreCalculator.getTruePositiveCount(outcome)) + "\n");
          errorWriter.write(outcome + " false- " + (taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
          errorWriter.write(outcome + " false+ " + (taggerFScoreCalculator.getFalsePositiveCount(outcome)) + "\n");
          errorWriter.write(outcome + " precis " + (taggerFScoreCalculator.getPrecision(outcome)) + "\n");
          errorWriter.write(outcome + " recall " + (taggerFScoreCalculator.getRecall(outcome)) + "\n");
          errorWriter.write(outcome + " fscore " + (taggerFScoreCalculator.getFScore(outcome)) + "\n");
        }

        List<String> errors = errorMap.get(boundaryCharacter);
        if (errors != null) {
          for (String error : errors) {
            errorWriter.write(error + "\n");
          }
        }
        errorWriter.flush();
      } // next boundary character
    } finally {
      // close the writer even if one of the writes above failed
      errorWriter.close();
    }
  } // have error writer

  return fScoreCalculator;
}
use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.
the class RawTextProcessor method getDetectedSentences.
/**
 * Get the list of sentences detected so far. Only complete sentences are
 * returned: if the last detected sentence is incomplete, it is removed from
 * the list and kept as the leftover for the next call.
 * <br>
 * Each returned sentence gets an observer which forwards any annotations added
 * to it on to this processor, with positions converted to original-text
 * indexes relative to this processor's original start index.
 *
 * @return the complete sentences detected in the current sentence holder;
 *         possibly empty
 */
public final List<Sentence> getDetectedSentences() {
  SentenceHolder previous = this.getPreviousSentenceHolder();
  SentenceHolder current = this.getCurrentSentenceHolder();

  // Boundary positions are shifted back by the length of the previous
  // holder's processed text before being handed to the current holder.
  for (Annotation<SentenceBoundary> boundary : sentenceBoundaries) {
    int relativeStart = boundary.getStart() - previous.getProcessedText().length();
    current.addSentenceBoundary(relativeStart);
    int relativeEnd = boundary.getEnd() - previous.getProcessedText().length();
    current.addSentenceBoundary(relativeEnd);
  }

  List<Sentence> detected = current.getDetectedSentences(leftover);
  leftover = null;

  // An incomplete trailing sentence is held back for the next round.
  if (!detected.isEmpty()) {
    int lastIndex = detected.size() - 1;
    Sentence candidate = detected.get(lastIndex);
    if (!candidate.isComplete()) {
      leftover = candidate;
      if (LOG.isTraceEnabled())
        LOG.trace("Set leftover to: " + leftover.toString());
      detected.remove(lastIndex);
    }
  }

  // ensure that sentence annotations get added to the raw text as well
  for (Sentence detectedSentence : detected) {
    AnnotationObserver forwarder = new AnnotationObserver() {
      // captured when the observer is created
      int originAtCreation = RawTextProcessor.this.originalStartIndex;

      @Override
      public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
        List<Annotation<T>> translated = new ArrayList<>();
        for (Annotation<T> source : annotations) {
          int startInOriginal = detectedSentence.getOriginalIndex(source.getStart());
          int endInOriginal = detectedSentence.getOriginalIndex(source.getEnd());
          translated.add(source.getAnnotation(startInOriginal - originAtCreation, endInOriginal - originAtCreation));
        }
        RawTextProcessor.this.addAnnotations(translated);
      }

      @Override
      public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
      }
    };
    detectedSentence.addObserver(forwarder);
  }

  // If the current holder has original text segments, append them to the
  // leftover sentence's leftover original text (creating an empty leftover
  // sentence if there is none).
  if (!current.getOriginalTextSegments().isEmpty()) {
    if (leftover == null) {
      leftover = new Sentence("", currentFile, sessionId);
    }
    StringBuilder pendingSegments = new StringBuilder();
    // separate from any leftover original text already present
    if (leftover.getLeftoverOriginalText().length() > 0)
      pendingSegments.append(TalismaneSession.get(sessionId).getOutputDivider());
    for (String segment : current.getOriginalTextSegments().values()) {
      pendingSegments.append(segment);
    }
    leftover.setLeftoverOriginalText(leftover.getLeftoverOriginalText() + pendingSegments.toString());
  }

  return detected;
}
use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.
the class RollingTextBlock method getRawTextBlock.
/**
 * Get a raw text block for annotation by filters. The block's text is block 3
 * followed by block 4 of the current RollingTextBlock, with analysis running
 * from position 0 to the end of block 3. It is assumed that annotations
 * crossing block 2 and 3 were already added by a predecessor.
 * <br>
 * Any annotations added to the returned block are copied onto this
 * RollingTextBlock, shifted forward by the combined length of blocks 1 and 2.
 *
 * @return a new annotated text over blocks 3 and 4
 */
public AnnotatedText getRawTextBlock() {
  AnnotatedText block = new AnnotatedText(this.block3 + this.block4, 0, this.block3.length());

  AnnotationObserver relay = new AnnotationObserver() {
    @Override
    public <T extends Serializable> void beforeAddAnnotations(AnnotatedText subject, List<Annotation<T>> annotations) {
      // nothing to forward
      if (annotations.isEmpty())
        return;

      // positions in the raw block start at block 3; on the rolling block
      // they are preceded by blocks 1 and 2
      int shift = RollingTextBlock.this.block1.length() + RollingTextBlock.this.block2.length();
      List<Annotation<T>> shifted = new ArrayList<>();
      for (Annotation<T> incoming : annotations) {
        shifted.add(incoming.getAnnotation(incoming.getStart() + shift, incoming.getEnd() + shift));
      }
      RollingTextBlock.this.addAnnotations(shifted);

      if (LOG.isTraceEnabled()) {
        LOG.trace("RawTextBlock Annotations received: " + annotations);
        LOG.trace("RawTextBlock Annotations added: " + shifted);
      }
    }

    @Override
    public <T extends Serializable> void afterAddAnnotations(AnnotatedText subject) {
    }
  };

  block.addObserver(relay);
  return block;
}
use of com.joliciel.talismane.AnnotatedText in project talismane by joliciel-informatique.
the class NewlineEndOfSentenceMarkerTest method testApply.
/**
 * Checks that the filter adds one sentence break marker and one skip marker
 * per line ending, both covering exactly the line-ending characters.
 */
@Test
public void testApply() throws Exception {
  NewlineEndOfSentenceMarker filter = new NewlineEndOfSentenceMarker(1000);

  // NOTE(review): the third scenario repeats the second one ("\r" line
  // endings). It may have been intended to cover bare "\n" - confirm.
  String[] inputs = { "1\r\n2\r\n", "1\r2\r", "1\r2\r" };
  // per scenario: { first start, first end, second start, second end }
  int[][] expectedSpans = { { 1, 3, 4, 6 }, { 1, 2, 3, 4 }, { 1, 2, 3, 4 } };

  for (int scenario = 0; scenario < inputs.length; scenario++) {
    int[] spans = expectedSpans[scenario];

    AnnotatedText text = new AnnotatedText(inputs[scenario]);
    filter.annotate(text);
    LOG.debug(text.getAnnotations().toString());

    List<Annotation<RawTextSentenceBreakMarker>> sentenceBreaks = text.getAnnotations(RawTextSentenceBreakMarker.class);
    assertEquals(2, sentenceBreaks.size());
    List<Annotation<RawTextSkipMarker>> skips = text.getAnnotations(RawTextSkipMarker.class);
    assertEquals(2, skips.size());

    // each line ending carries both a sentence break and a skip over the
    // same span
    for (int marker = 0; marker < 2; marker++) {
      int expectedStart = spans[2 * marker];
      int expectedEnd = spans[2 * marker + 1];
      assertEquals(expectedStart, sentenceBreaks.get(marker).getStart());
      assertEquals(expectedEnd, sentenceBreaks.get(marker).getEnd());
      assertEquals(expectedStart, skips.get(marker).getStart());
      assertEquals(expectedEnd, skips.get(marker).getEnd());
    }
  }
}
Aggregations