Search in sources :

Example 6 with TokenisedAtomicTokenSequence

Use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in the project talismane by joliciel-informatique.

The method compare of the class TokenComparator.

/**
 * Compares the evaluation corpus against the reference corpus, sentence by
 * sentence, and notifies every registered observer.
 * <p>
 * For each reference sentence, the next sentence is read from the evaluation
 * corpus. Both sentences are split into default (atomic) tokens, the test
 * patterns are matched against the reference atomic tokens, and a
 * {@link TokenisedAtomicTokenSequence} is built describing, for each
 * reference atomic token, the outcome found in the evaluation corpus.
 * Reference tokens whose text differs from the current guessed token are
 * tagged {@link TokeniserOutcome#SEPARATE} and counted as mismatches without
 * advancing in the guessed sequence.
 *
 * @throws TalismaneException
 *             if the evaluation corpus has fewer sentences than the
 *             reference corpus, if the guessed sentence runs out of atomic
 *             tokens before the reference sentence does, or if more than 6
 *             mismatches are found in a single sentence
 * @throws IOException
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();

        Sentence sentence = realSequence.getSentence();
        // Initially, separate the sentences into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index every pattern match by the first non-whitespace token it
        // needs to check (or by its last token if all are whitespace).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            for (TokenPatternMatchSequence matchSequence : parsedPattern.match(realAtomicSequence)) {
                Token keyToken = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    keyToken = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(keyToken);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(keyToken, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            // Fail with a meaningful error rather than an unchecked index
            // error when the guessed sentence has run out of atomic tokens.
            if (i >= guessedAtomicSequence.size()) {
                LOG.info("Real sequence: " + realSequence.getSentence().getText());
                LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                throw new TalismaneException("Too few tokens in guessed sequence for sentence: " + realSequence.getSentence().getText());
            }

            String guessedText = guessedAtomicSequence.get(i).getToken().getText();
            if (!token.getText().equals(guessedText)) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                Decision decision = this.buildComparisonDecision(outcome, token, tokenMatchSequenceMap);
                guess.addTaggedToken(token, decision, outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedText + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                // do not advance in the guessed sequence: the same guessed
                // token is compared with the next reference token
                continue;
            }

            // Texts agree: the outcome is SEPARATE if the guessed sequence
            // has a token split at this atomic token's start index, and JOIN
            // otherwise.
            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            Decision decision = this.buildComparisonDecision(outcome, token, tokenMatchSequenceMap);
            guess.addTaggedToken(token, decision, outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision for a given outcome, adding this class's simple name
 * as an authority, followed by "_Patterns" and the name of each pattern
 * whose match sequence is indexed under the given token, if any.
 */
private Decision buildComparisonDecision(TokeniserOutcome outcome, Token token,
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Aggregations

TokenisedAtomicTokenSequence (com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)6 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)5 Token (com.joliciel.talismane.tokeniser.Token)4 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)4 Sentence (com.joliciel.talismane.rawText.Sentence)3 TaggedToken (com.joliciel.talismane.tokeniser.TaggedToken)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 TreeSet (java.util.TreeSet)3 TalismaneException (com.joliciel.talismane.TalismaneException)2 Decision (com.joliciel.talismane.machineLearning.Decision)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Set (java.util.Set)2 TalismaneSession (com.joliciel.talismane.TalismaneSession)1 ClassificationModel (com.joliciel.talismane.machineLearning.ClassificationModel)1 ClassificationObserver (com.joliciel.talismane.machineLearning.ClassificationObserver)1 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)1 ExternalResource (com.joliciel.talismane.machineLearning.ExternalResource)1 MachineLearningModelFactory (com.joliciel.talismane.machineLearning.MachineLearningModelFactory)1