Search in sources :

Example 41 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

The method compare of the class TokenComparator.

/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <p>
 * Sentences are read pairwise from the reference and evaluation corpus
 * readers. Both sentences are split into default (atomic) tokens, and each
 * reference atomic token is tagged with a {@link TokeniserOutcome}:
 * <ul>
 * <li>if its text differs from the current guessed atomic token, it is tagged
 * {@code SEPARATE}, counted as a mismatch, and the guessed position is not
 * advanced;</li>
 * <li>otherwise it is tagged {@code SEPARATE} when the guessed sequence has a
 * token split at the guessed token's start index, and {@code JOIN} if
 * not.</li>
 * </ul>
 * The tagged sequence is passed to every observer after each sentence, and
 * every observer is notified once both corpora have been consumed.
 *
 * @throws TalismaneException
 *           if the evaluation corpus has fewer sentences than the reference
 *           corpus, if a sentence has more than 6 mismatching atomic tokens,
 *           or if the guessed atomic tokens run out before the reference
 *           atomic tokens
 * @throws IOException
 *           declared for the underlying corpus readers
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();
        Sentence sentence = realSequence.getSentence();
        // Initially, separate the sentence into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index every pattern match by the first non-whitespace token it
        // checks (or by the last token checked if all are whitespace), so
        // that the pattern names can be attached to that token's decision.
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            if (i >= guessedAtomicSequence.size()) {
                // The guess has no token left to compare against: fail with
                // context instead of an IndexOutOfBoundsException.
                LOG.info("Real sequence: " + realSequence.getSentence().getText());
                LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                throw new TalismaneException("Guessed sequence has too few tokens for sentence: " + realSequence.getSentence().getText());
            }
            Token guessedToken = guessedAtomicSequence.get(i).getToken();
            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);

            if (!token.getText().equals(guessedToken.getText())) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                guess.addTaggedToken(token, this.newComparatorDecision(outcome, matchSequences), outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedToken.getText() + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                // Stay on the same guessed token: only the real token was
                // consumed.
                continue;
            }

            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedToken.getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            guess.addTaggedToken(token, this.newComparatorDecision(outcome, matchSequences), outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Build the decision for one atomic token: named after the outcome, with this
 * class's simple name as an authority, followed by "_Patterns" and the name
 * of each matching pattern when any pattern match is registered for the
 * token.
 *
 * @param outcome
 *          the outcome assigned to the token
 * @param matchSequences
 *          the pattern matches registered for the token, or null if none
 * @return the populated decision
 */
private Decision newComparatorDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 42 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

The method getTrailingRawOutput of the class Token.

/**
 * Returns the raw text to output after this token. Only the final token of
 * its sequence can yield a value: for every other token the result is null.
 * For the final token the result is whatever the sentence returns as raw
 * input from this token's start index onwards, which may itself be null.
 *
 * @return the trailing raw text, or null if none is available
 */
public String getTrailingRawOutput() {
    boolean isFinalToken = this.index == this.getTokenSequence().size() - 1;
    if (!isFinalToken) {
        return null;
    }

    Sentence owningSentence = this.getTokenSequence().getSentence();
    // The upper bound is effectively open-ended.
    String trailing = owningSentence.getRawInput(startIndex, Integer.MAX_VALUE - 1);
    if (LOG.isTraceEnabled() && trailing != null) {
        LOG.trace("getTrailingRawOutput: " + trailing);
    }
    return trailing;
}
Also used : Sentence(com.joliciel.talismane.rawText.Sentence)

Example 43 with Sentence

use of com.joliciel.talismane.rawText.Sentence in project talismane by joliciel-informatique.

The method getPrecedingRawOutput of the class Token.

/**
 * Returns the raw text to output before this token. The sentence is asked for
 * its raw input between the previous token's start index (or -1 when this is
 * the first token of the sequence) and this token's start index.
 *
 * @return the preceding raw text, or null if none is available
 */
public String getPrecedingRawOutput() {
    Sentence owningSentence = this.getTokenSequence().getSentence();

    // -1 marks "no previous token".
    int lowerBound = this.index > 0 ? this.getTokenSequence().get(index - 1).getStartIndex() : -1;

    String preceding = startIndex > lowerBound ? owningSentence.getRawInput(lowerBound, startIndex) : null;

    if (LOG.isTraceEnabled() && preceding != null) {
        LOG.trace("getPrecedingRawOutput: " + preceding);
    }
    return preceding;
}
Also used : Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

Sentence (com.joliciel.talismane.rawText.Sentence)43 Config (com.typesafe.config.Config)31 TalismaneTest (com.joliciel.talismane.TalismaneTest)28 Test (org.junit.Test)28 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)25 Token (com.joliciel.talismane.tokeniser.Token)14 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)13 Annotation (com.joliciel.talismane.Annotation)12 Decision (com.joliciel.talismane.machineLearning.Decision)11 ArrayList (java.util.ArrayList)9 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 TalismaneException (com.joliciel.talismane.TalismaneException)6 HashMap (java.util.HashMap)6 List (java.util.List)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)5