Search in sources :

Example 1 with FScoreCalculator

use of com.joliciel.talismane.stats.FScoreCalculator in project talismane by joliciel-informatique.

In the class SentenceDetectorEvaluator, the method evaluate.

/**
 * Evaluate a given sentence detector.
 *
 * @return an f-score calculator for this sentence detector
 * @throws TalismaneException
 * @throws IOException
 */
public FScoreCalculator<SentenceDetectorOutcome> evaluate() throws TalismaneException, IOException {
    FScoreCalculator<SentenceDetectorOutcome> fScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();
    // add f-score per tagger module, to see how we do for each boundary
    // character
    Map<String, FScoreCalculator<SentenceDetectorOutcome>> taggerFScoreCalculators = new TreeMap<String, FScoreCalculator<SentenceDetectorOutcome>>();
    Map<String, List<String>> errorMap = new TreeMap<String, List<String>>();
    LinkedList<String> sentences = new LinkedList<String>();
    String sentence = null;
    String previousSentence = ". ";
    if (corpusReader.hasNextSentence())
        sentence = corpusReader.nextSentence().getText().toString();
    sentences.add(sentence);
    while (!sentences.isEmpty()) {
        sentence = sentences.poll();
        LOG.debug("Sentence: " + sentence);
        String moreText = "";
        int sentenceIndex = 0;
        while (moreText.length() < minCharactersAfterBoundary) {
            String nextSentence = "";
            if (sentenceIndex < sentences.size()) {
                nextSentence = sentences.get(sentenceIndex);
            } else if (corpusReader.hasNextSentence()) {
                nextSentence = corpusReader.nextSentence().getText().toString();
                sentences.add(nextSentence);
            } else {
                break;
            }
            if (nextSentence.startsWith(" ") || nextSentence.startsWith("\n"))
                moreText += nextSentence;
            else
                moreText += " " + nextSentence;
            sentenceIndex++;
        }
        String text = previousSentence + sentence + moreText;
        AnnotatedText annotatedText = new AnnotatedText(text, previousSentence.length(), previousSentence.length() + sentence.length(), new ArrayList<>());
        Matcher matcher = sentenceDetector.getPossibleBoundaryPattern().matcher(text);
        List<Integer> possibleBoundaries = new ArrayList<Integer>();
        while (matcher.find()) {
            if (matcher.start() >= annotatedText.getAnalysisStart() && matcher.start() < annotatedText.getAnalysisEnd())
                possibleBoundaries.add(matcher.start());
        }
        int realBoundary = previousSentence.length() + sentence.length();
        if (!possibleBoundaries.contains(realBoundary))
            possibleBoundaries.add(realBoundary);
        List<Integer> guessedBoundaries = this.sentenceDetector.detectSentences(annotatedText);
        for (int possibleBoundary : possibleBoundaries) {
            SentenceDetectorOutcome expected = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
            SentenceDetectorOutcome guessed = SentenceDetectorOutcome.IS_NOT_BOUNDARY;
            if (possibleBoundary == realBoundary)
                expected = SentenceDetectorOutcome.IS_BOUNDARY;
            if (guessedBoundaries.contains(possibleBoundary))
                guessed = SentenceDetectorOutcome.IS_BOUNDARY;
            fScoreCalculator.increment(expected, guessed);
            String boundaryCharacter = "" + text.charAt(possibleBoundary - 1);
            Matcher boundaryMatcher = sentenceDetector.getPossibleBoundaryPattern().matcher(boundaryCharacter);
            if (!boundaryMatcher.matches())
                boundaryCharacter = "OTHER";
            FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
            if (taggerFScoreCalculator == null) {
                taggerFScoreCalculator = new FScoreCalculator<SentenceDetectorOutcome>();
                taggerFScoreCalculators.put(boundaryCharacter, taggerFScoreCalculator);
            }
            taggerFScoreCalculator.increment(expected, guessed);
            if (!expected.equals(guessed)) {
                int start1 = possibleBoundary - NUM_CHARS;
                int end1 = possibleBoundary + NUM_CHARS;
                if (start1 < 0)
                    start1 = 0;
                String startString = text.substring(start1, possibleBoundary - 1);
                startString = StringUtils.padLeft(startString, NUM_CHARS);
                String middleString = "" + text.charAt(possibleBoundary - 1);
                if (end1 >= text.length())
                    end1 = text.length() - 1;
                String endString = "";
                if (end1 >= 0 && possibleBoundary < text.length())
                    endString = text.substring(possibleBoundary, end1);
                String testText = startString + "[" + middleString + "]" + endString;
                testText = testText.replace('\n', 'ΒΆ');
                String error = "Guessed " + guessed + ", Expected " + expected + ". Text: " + testText;
                LOG.debug(error);
                List<String> errors = errorMap.get(boundaryCharacter);
                if (errors == null) {
                    errors = new ArrayList<String>();
                    errorMap.put(boundaryCharacter, errors);
                }
                errors.add(error);
            }
        // have error
        }
        // next possible boundary
        if (sentence.endsWith(" "))
            previousSentence = sentence;
        else
            previousSentence = sentence + " ";
    }
    for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
        LOG.debug("###### Boundary " + boundaryCharacter);
        FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
        LOG.debug("###### Boundary " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore());
    }
    if (errorWriter != null) {
        for (String boundaryCharacter : taggerFScoreCalculators.keySet()) {
            FScoreCalculator<SentenceDetectorOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(boundaryCharacter);
            errorWriter.write("###### Tagger " + boundaryCharacter + ": f-score = " + taggerFScoreCalculator.getTotalFScore() + "\n");
            errorWriter.write("Total " + (taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount()) + "\n");
            errorWriter.write("True + " + taggerFScoreCalculator.getTotalTruePositiveCount() + "\n");
            errorWriter.write("False- " + taggerFScoreCalculator.getTotalFalseNegativeCount() + "\n");
            errorWriter.write("False+ " + taggerFScoreCalculator.getTotalFalsePositiveCount() + "\n");
            for (SentenceDetectorOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
                errorWriter.write(outcome + " total  " + (taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                errorWriter.write(outcome + " true + " + (taggerFScoreCalculator.getTruePositiveCount(outcome)) + "\n");
                errorWriter.write(outcome + " false- " + (taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                errorWriter.write(outcome + " false+ " + (taggerFScoreCalculator.getFalsePositiveCount(outcome)) + "\n");
                errorWriter.write(outcome + " precis " + (taggerFScoreCalculator.getPrecision(outcome)) + "\n");
                errorWriter.write(outcome + " recall " + (taggerFScoreCalculator.getRecall(outcome)) + "\n");
                errorWriter.write(outcome + " fscore " + (taggerFScoreCalculator.getFScore(outcome)) + "\n");
            }
            List<String> errors = errorMap.get(boundaryCharacter);
            if (errors != null) {
                for (String error : errors) {
                    errorWriter.write(error + "\n");
                }
            }
            errorWriter.flush();
        }
        // next boundary character
        errorWriter.close();
    }
    // have error writer
    return fScoreCalculator;
}
Also used : AnnotatedText(com.joliciel.talismane.AnnotatedText) Matcher(java.util.regex.Matcher) FScoreCalculator(com.joliciel.talismane.stats.FScoreCalculator) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) LinkedList(java.util.LinkedList) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList)

Aggregations

AnnotatedText (com.joliciel.talismane.AnnotatedText)1 FScoreCalculator (com.joliciel.talismane.stats.FScoreCalculator)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 TreeMap (java.util.TreeMap)1 Matcher (java.util.regex.Matcher)1