use of com.joliciel.talismane.stats.FScoreCalculator in project talismane by joliciel-informatique.
the class SentenceDetectorEvaluator method evaluate.
/**
* Evaluate a given sentence detector.
*
* @return an f-score calculator for this sentence detector
* @throws TalismaneException
* @throws IOException
*/
public FScoreCalculator<SentenceDetectorOutcome> evaluate() throws TalismaneException, IOException {
FScoreCalculator<SentenceDetectorOutcome> fScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();
// add f-score per tagger module, to see how we do for each boundary
// character
Map<String, FScoreCalculator<SentenceDetectorOutcome>> taggerFScoreCalculators = new TreeMap<String, FScoreCalculator<SentenceDetectorOutcome>>();
Map<String, List<String>> errorMap = new TreeMap<String, List<String>>();
LinkedList<String> sentences = new LinkedList<String>();
String sentence = null;
String previousSentence = ". ";
if (corpusReader.hasNextSentence())
sentence = corpusReader.nextSentence().getText().toString();
sentences.add(sentence);
while (!sentences.isEmpty()) {
sentence = sentences.poll();
LOG.debug("Sentence: " + sentence);
String moreText = "";
int sentenceIndex = 0;
while (moreText.length() < minCharactersAfterBoundary) {
String nextSentence = "";
if (sentenceIndex < sentences.size()) {
nextSentence = sentences.get(sentenceIndex);
} else if (corpusReader.hasNextSentence()) {
nextSentence = corpusReader.nextSentence().getText().toString();
sentences.add(nextSentence);
} else {
break;
}
if (nextSentence.startsWith(" ") || nextSentence.startsWith("\n"))
moreText += nextSentence;
else
moreText += " " + nextSentence;
sentenceIndex++;
}
String text = previousSentence + sentence + moreText;
AnnotatedText annotatedText = new AnnotatedText(text, previousSentence.length(), previousSentence.length() + sentence.length(), new ArrayList<>());
Matcher matcher = sentenceDetector.getPossibleBoundaryPattern().matcher(text);
List<Integer> possibleBoundaries = new ArrayList<Integer>();
while (matcher.find()) {
if (matcher.start() >= annotatedText.getAnalysisStart() && matcher.start() < annotatedText.getAnalysisEnd())
possibleBoundaries.add(matcher.start());
}
int realBoundary = previousSentence.length() + sentence.length();
if (!possibleBoundaries.contains(realBoundary))
possibleBoundaries.add(realBoundary);
List<Integer> guessedBoundaries = this.sentenceDetector.detectSentences(annotatedText);
for (int possibleBoundary : possibleBoundaries) {
SentenceDetectorOutcome expected = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
SentenceDetectorOutcome guessed = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
if (possibleBoundary == realBoundary)
expected = SentenceDetectorOutcome.IS_BOUNDARY;
if (guessedBoundaries.contains(possibleBoundary))
guessed = SentenceDetectorOutcome.IS_BOUNDARY;
fScoreCalculator.increment(expected, guessed);
String boundaryCharacter = "" + text.charAt(possibleBoundary - 1);
Matcher boundaryMatcher = sentenceDetector.getPossibleBoundaryPattern().matcher(boundaryCharacter);
if (!boundaryMatcher.matches())
boundaryCharacter = "OTHER";
FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
if (taggerFScoreCalculator == null) {
taggerFScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();
taggerFScoreCalculators.put(boundaryCharacter, taggerFScoreCalculator);
}
taggerFScoreCalculator.increment(expected, guessed);
if (!expected.equals(guessed)) {
int start1 = possibleBoundary - NUM_CHARS;
int end1 = possibleBoundary + NUM_CHARS;
if (start1 < 0)
start1 = 0;
String startString = text.substring(start1, possibleBoundary - 1);
startString = StringUtils.padLeft(startString, NUM_CHARS);
String middleString = "" + text.charAt(possibleBoundary - 1);
if (end1 >= text.length())
end1 = text.length() - 1;
String endString = "";
if (end1 >= 0 && possibleBoundary < text.length())
endString = text.substring(possibleBoundary, end1);
String testText = startString + "[" + middleString + "]" + endString;
testText = testText.replace('\n', 'ΒΆ');
String error = "Guessed " + guessed + ", Expected " + expected + ". Text: " + testText;
LOG.debug(error);
List<String> errors = errorMap.get(boundaryCharacter);
if (errors == null) {
errors = new ArrayList<String>();
errorMap.put(boundaryCharacter, errors);
}
errors.add(error);
}
// have error
}
// next possible boundary
if (sentence.endsWith(" "))
previousSentence = sentence;
else
previousSentence = sentence + " ";
}
for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
LOG.debug("###### Boundary " + boundaryCharacter);
FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
LOG.debug("###### Boundary " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore());
}
if (errorWriter != null) {
for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
errorWriter.write("###### Tagger " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore() + "\n");
errorWriter.write("Total " + (taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount()) + "\n");
errorWriter.write("True + " + taggerFScoreCalculator.getTotalTruePositiveCount() + "\n");
errorWriter.write("False- " + taggerFScoreCalculator.getTotalFalseNegativeCount() + "\n");
errorWriter.write("False+ " + taggerFScoreCalculator.getTotalFalsePositiveCount() + "\n");
for (SentenceDetectorOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
errorWriter.write(outcome + " total " + (taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
errorWriter.write(outcome + " true + " + (taggerFScoreCalculator.getTruePositiveCount(outcome)) + "\n");
errorWriter.write(outcome + " false- " + (taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
errorWriter.write(outcome + " false+ " + (taggerFScoreCalculator.getFalsePositiveCount(outcome)) + "\n");
errorWriter.write(outcome + " precis " + (taggerFScoreCalculator.getPrecision(outcome)) + "\n");
errorWriter.write(outcome + " recall " + (taggerFScoreCalculator.getRecall(outcome)) + "\n");
errorWriter.write(outcome + " fscore " + (taggerFScoreCalculator.getFScore(outcome)) + "\n");
}
List<String> errors = errorMap.get(boundaryCharacter);
if (errors != null) {
for (String error : errors) {
errorWriter.write(error + "\n");
}
}
errorWriter.flush();
}
// next boundary character
errorWriter.close();
}
// have error writer
return fScoreCalculator;
}
Aggregations