Search in sources :

Example 6 with TokenPattern

use of com.joliciel.talismane.tokeniser.patterns.TokenPattern in project talismane by joliciel-informatique.

the class PatternWordFormFeature method checkInternal.

/**
 * Produces a feature whose outcome is the concatenated analysis text of the
 * tokens covered by a token pattern, when the wrapped token is the first
 * token to be tested in that pattern.
 * <p>
 * The pattern is looked up in {@code patternMap} using the outcome of
 * {@code tokenPatternFeature}. The text is assembled from
 * {@code tokenPattern.getTokenCount()} consecutive entries of the token
 * sequence's {@code listWithWhiteSpace()}, starting at the position where
 * the match begins.
 *
 * @param tokenWrapper
 *            wrapper around the token being tested
 * @param env
 *            runtime environment, passed through to
 *            {@code tokenPatternFeature}
 * @return the generated result, or {@code null} if
 *         {@code tokenPatternFeature} returns no result, or if the token
 *         has no match for the pattern at the pattern's first index to test
 * @throws TalismaneException
 *             declared by the overridden method; presumably propagated from
 *             the nested feature check
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    Token token = tokenWrapper.getToken();
    FeatureResult<String> tokenPatternResult = tokenPatternFeature.check(tokenWrapper, env);
    if (tokenPatternResult == null) {
        return null;
    }

    // If we have a token pattern, then this is the first token to be
    // tested in that pattern
    TokenPattern tokenPattern = this.patternMap.get(tokenPatternResult.getOutcome());

    // Find the match in which the current token matches the pattern at
    // its first test index
    TokenPatternMatch theMatch = null;
    for (TokenPatternMatch tokenMatch : token.getMatches(tokenPattern)) {
        if (tokenMatch.getPattern().equals(tokenPattern) && tokenMatch.getIndex() == tokenPattern.getIndexesToTest().get(0)) {
            theMatch = tokenMatch;
            break;
        }
    }
    if (theMatch == null) {
        return null;
    }

    // Position (whitespace included) of the first token covered by the
    // pattern: the current token's position minus its index in the match.
    int firstIndex = token.getIndexWithWhiteSpace() - theMatch.getIndex();

    // StringBuilder avoids re-copying the accumulated text on every
    // iteration, as repeated String concatenation would.
    StringBuilder unigram = new StringBuilder();
    for (int i = 0; i < tokenPattern.getTokenCount(); i++) {
        Token aToken = token.getTokenSequence().listWithWhiteSpace().get(firstIndex + i);
        unigram.append(aToken.getAnalyisText());
    }
    return this.generateResult(unigram.toString());
}
Also used : Token(com.joliciel.talismane.tokeniser.Token) TokenPatternMatch(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatch) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern)

Example 7 with TokenPattern

use of com.joliciel.talismane.tokeniser.patterns.TokenPattern in project talismane by joliciel-informatique.

the class TokenComparator method compare.

/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <p>
 * For every sentence of the reference corpus, the next sentence of the
 * evaluation corpus is read, both are split into default (atomic) tokens,
 * and each reference atomic token is tagged with a {@link TokeniserOutcome}:
 * <ul>
 * <li>if its text differs from the current guessed atomic token, it is
 * tagged {@code SEPARATE}, counted as a mismatch, and the guessed position
 * is not advanced;</li>
 * <li>otherwise it is tagged {@code SEPARATE} when the guessed sequence has
 * a token split at the guessed token's start index, and {@code JOIN} if
 * not.</li>
 * </ul>
 * Each decision carries this class's simple name as an authority, plus the
 * names of the test patterns whose match sequence is keyed on that token.
 * Observers are notified after every sentence and once at the end.
 *
 * @throws TalismaneException
 *             if the evaluation corpus has fewer sentences than the
 *             reference corpus, or if a single sentence produces more than
 *             6 token mismatches
 * @throws IOException
 *             presumably if reading from one of the corpus readers fails
 *             (the readers are not visible here)
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();

        Sentence sentence = realSequence.getSentence();
        // Initially, separate the sentence into tokens using the separators
        // provided
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        // Index every pattern match sequence by the first non-whitespace
        // token it has to check (or by its last token to check if all of
        // them are whitespace).
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                Token token = null;
                for (Token aToken : matchSequence.getTokensToCheck()) {
                    token = aToken;
                    if (!aToken.isWhiteSpace()) {
                        break;
                    }
                }
                Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                if (matchSequences == null) {
                    matchSequences = new TreeSet<TokenPatternMatchSequence>();
                    tokenMatchSequenceMap.put(token, matchSequences);
                }
                matchSequences.add(matchSequence);
            }
        }

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        // i tracks the position in the guessed atomic sequence; it only
        // advances when the real and guessed token texts agree.
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            // NOTE(review): nothing here guards against i running past the
            // end of guessedAtomicSequence - confirm get(i) cannot fail.
            Token guessedToken = guessedAtomicSequence.get(i).getToken();
            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);

            if (!token.getText().equals(guessedToken.getText())) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                guess.addTaggedToken(token, this.buildComparisonDecision(outcome, matchSequences), outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedToken.getText() + "'");
                // Give up on the sentence once the two tokenisations have
                // clearly diverged.
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                continue;
            }

            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedToken.getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            guess.addTaggedToken(token, this.buildComparisonDecision(outcome, matchSequences), outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Builds the decision recorded for one atomic token during comparison.
 * <p>
 * The decision is named after the outcome and always lists
 * {@code "_" + <simple class name>} as its first authority. When match
 * sequences are supplied, {@code "_Patterns"} is added, followed by the
 * name of each sequence's token pattern.
 *
 * @param outcome
 *            the outcome assigned to the token
 * @param matchSequences
 *            the pattern match sequences keyed on the token, or
 *            {@code null} if there are none
 * @return the populated decision
 */
private Decision buildComparisonDecision(TokeniserOutcome outcome, Set<TokenPatternMatchSequence> matchSequences) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Aggregations

TokenPattern (com.joliciel.talismane.tokeniser.patterns.TokenPattern)7 Token (com.joliciel.talismane.tokeniser.Token)5 TalismaneException (com.joliciel.talismane.TalismaneException)2 TokenPatternMatch (com.joliciel.talismane.tokeniser.patterns.TokenPatternMatch)2 Decision (com.joliciel.talismane.machineLearning.Decision)1 Sentence (com.joliciel.talismane.rawText.Sentence)1 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)1 TokenisedAtomicTokenSequence (com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)1 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)1 TokenPatternMatchSequence (com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1 TreeSet (java.util.TreeSet)1