Search in sources :

Example 61 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

The compare method of the TokenComparator class.

/**
 * Evaluate the evaluation corpus against the reference corpus.
 * <p>
 * Sentences are read pairwise from the reference and the evaluation corpus
 * readers. Each pair is re-split into atomic tokens, the atomic tokens of
 * the reference sentence are tagged with a JOIN/SEPARATE decision derived
 * from the evaluation sentence, and the result is handed to every
 * registered observer. Once the reference corpus is exhausted, every
 * observer is told that the evaluation is complete.
 *
 * @throws TalismaneException
 *             if the evaluation corpus has no sentence left while the
 *             reference corpus still has one, or if more than 6 atomic
 *             tokens of a single sentence fail to line up
 * @throws IOException
 *             declared by this method; presumably raised by the corpus
 *             readers (their code is not visible here)
 */
public void compare() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        TokenSequence realSequence = referenceCorpusReader.nextTokenSequence();
        if (!evaluationCorpusReader.hasNextSentence()) {
            throw new TalismaneException("Wrong number of sentences in eval corpus: " + realSequence.getSentence().getText());
        }
        TokenSequence guessedSequence = evaluationCorpusReader.nextTokenSequence();

        // Initially, separate both sentences into tokens using the
        // separators provided
        Sentence sentence = realSequence.getSentence();
        TokenSequence realAtomicSequence = new TokenSequence(sentence, sessionId);
        realAtomicSequence.findDefaultTokens();
        TokenSequence guessedAtomicSequence = new TokenSequence(guessedSequence.getSentence(), sessionId);
        guessedAtomicSequence.findDefaultTokens();

        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = this.indexPatternMatchesByFirstToken(realAtomicSequence);

        TokenisedAtomicTokenSequence guess = new TokenisedAtomicTokenSequence(realSequence.getSentence(), 0, sessionId);
        // i indexes the guessed atomic sequence; it only advances when the
        // guessed token lines up with the current real token.
        // NOTE(review): get(i) is not range-checked, so a guessed sequence
        // shorter than the real one may fail here - confirm the behaviour of
        // TokenSequence.get for an out-of-range index.
        int i = 0;
        int mismatches = 0;
        for (Token token : realAtomicSequence) {
            String guessedText = guessedAtomicSequence.get(i).getToken().getText();
            if (!token.getText().equals(guessedText)) {
                // skipped stuff at start of sentence on guess, if it's been
                // through the parser
                TokeniserOutcome outcome = TokeniserOutcome.SEPARATE;
                Decision decision = this.newDecisionWithAuthorities(token, outcome, tokenMatchSequenceMap);
                guess.addTaggedToken(token, decision, outcome);
                mismatches++;
                LOG.debug("Mismatch: '" + token.getText() + "', '" + guessedText + "'");
                if (mismatches > 6) {
                    LOG.info("Real sequence: " + realSequence.getSentence().getText());
                    LOG.info("Guessed sequence: " + guessedSequence.getSentence().getText());
                    throw new TalismaneException("Too many mismatches for sentence: " + realSequence.getSentence().getText());
                }
                continue;
            }

            // The texts line up: the token is SEPARATE when the guessed
            // sequence records a split at its start index, JOIN otherwise.
            TokeniserOutcome outcome = TokeniserOutcome.JOIN;
            if (guessedSequence.getTokenSplits().contains(guessedAtomicSequence.get(i).getToken().getStartIndex())) {
                outcome = TokeniserOutcome.SEPARATE;
            }
            Decision decision = this.newDecisionWithAuthorities(token, outcome, tokenMatchSequenceMap);
            guess.addTaggedToken(token, decision, outcome);
            i++;
        }

        List<TokenisedAtomicTokenSequence> guessedAtomicSequences = new ArrayList<TokenisedAtomicTokenSequence>();
        guessedAtomicSequences.add(guess);
        for (TokenEvaluationObserver observer : observers) {
            observer.onNextTokenSequence(realSequence, guessedAtomicSequences);
        }
    }
    for (TokenEvaluationObserver observer : observers) {
        observer.onEvaluationComplete();
    }
}

/**
 * Runs every parsed test pattern against the atomic sequence and groups the
 * resulting match sequences by the first non-whitespace token among their
 * tokens to check (or the last token to check, if all are whitespace).
 */
private Map<Token, Set<TokenPatternMatchSequence>> indexPatternMatchesByFirstToken(TokenSequence realAtomicSequence) throws TalismaneException {
    Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
    for (TokenPattern parsedPattern : tokeniserPatternManager.getParsedTestPatterns()) {
        List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(realAtomicSequence);
        for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
            // Stays null when there are no tokens to check; the match is
            // then filed under the null key, as HashMap permits.
            Token token = null;
            for (Token aToken : matchSequence.getTokensToCheck()) {
                token = aToken;
                if (!aToken.isWhiteSpace()) {
                    break;
                }
            }
            Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
            if (matchSequences == null) {
                matchSequences = new TreeSet<TokenPatternMatchSequence>();
                tokenMatchSequenceMap.put(token, matchSequences);
            }
            matchSequences.add(matchSequence);
        }
    }
    return tokenMatchSequenceMap;
}

/**
 * Builds the decision for one atomic token: named after the outcome, with
 * this class as first authority, followed by "_Patterns" and the name of
 * each pattern whose match sequence is indexed under the token, if any.
 */
private Decision newDecisionWithAuthorities(Token token, TokeniserOutcome outcome, Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap) {
    Decision decision = new Decision(outcome.name());
    decision.addAuthority("_" + this.getClass().getSimpleName());
    Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
    if (matchSequences != null) {
        decision.addAuthority("_Patterns");
        for (TokenPatternMatchSequence matchSequence : matchSequences) {
            decision.addAuthority(matchSequence.getTokenPattern().getName());
        }
    }
    return decision;
}
Also used : TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) TokenPattern(com.joliciel.talismane.tokeniser.patterns.TokenPattern) TokenPatternMatchSequence(com.joliciel.talismane.tokeniser.patterns.TokenPatternMatchSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) HashSet(java.util.HashSet) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 62 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

The checkInternal method of the BackwardSearchFeature class.

/**
 * Walks the token sequence backwards and returns the first token that
 * satisfies the criterion.
 * <p>
 * The walk starts at the index just before the current token (or at the
 * index given by the start index feature) and goes down to index 0 (or to
 * the index given by the end index feature), both inclusive. It ends early
 * on a token that satisfies the stop criterion, if one is set.
 *
 * @return a result wrapping the matching token, or null if the wrapped
 *         token cannot be resolved, an index feature yields no result, the
 *         index range is invalid, or no token matches
 */
@Override
public FeatureResult<TokenWrapper> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    Token token = resolvedWrapper.getToken();

    // Defaults: from the token just before this one down to the start.
    int from = token.getIndex() - 1;
    int downTo = 0;

    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(resolvedWrapper, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> downToResult = endIndexFeature.check(resolvedWrapper, env);
        if (downToResult == null) {
            return null;
        }
        downTo = downToResult.getOutcome();
    }

    // Reject ranges that cannot contain any token.
    if (from < 0 || downTo >= token.getTokenSequence().size() || downTo > from) {
        return null;
    }
    // A start beyond the sequence is clamped to the last token.
    if (from >= token.getTokenSequence().size()) {
        from = token.getTokenSequence().size() - 1;
    }

    // A negative lower bound is treated as 0.
    int lowestIndex = Math.max(downTo, 0);
    for (int position = from; position >= lowestIndex; position--) {
        Token candidate = token.getTokenSequence().get(position);
        FeatureResult<Boolean> matchResult = this.criterion.check(candidate, env);
        if (matchResult != null && matchResult.getOutcome()) {
            return this.generateResult(candidate);
        }
        if (stopCriterion != null) {
            FeatureResult<Boolean> stopResult = this.stopCriterion.check(candidate, env);
            if (stopResult != null && stopResult.getOutcome()) {
                return null;
            }
        }
    }
    return null;
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Example 63 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

The checkInternal method of the CountIfFeature class.

/**
 * Counts the tokens in the current token's sequence that satisfy the
 * criterion.
 * <p>
 * Counting runs from the start index (0 unless a start index feature is
 * set) to the end index (the sequence size unless an end index feature is
 * set), both inclusive, and never beyond the last token of the sequence. A
 * negative start index is treated as 0.
 *
 * @return a result holding the count, or null if the wrapped token cannot
 *         be resolved, an index feature yields no result, or the end index
 *         is smaller than the start index
 */
@Override
public FeatureResult<Integer> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper innerWrapper = this.getToken(tokenWrapper, env);
    if (innerWrapper == null)
        return null;
    Token token = innerWrapper.getToken();
    int startIndex = 0;
    int endIndex = token.getTokenSequence().size();
    // The start index feature is optional, like the end index feature:
    // without it, counting starts at the beginning of the sequence.
    if (startIndexFeature != null) {
        FeatureResult<Integer> startIndexResult = startIndexFeature.check(innerWrapper, env);
        if (startIndexResult != null) {
            startIndex = startIndexResult.getOutcome();
        } else {
            return null;
        }
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> endIndexResult = endIndexFeature.check(innerWrapper, env);
        if (endIndexResult != null) {
            endIndex = endIndexResult.getOutcome();
        } else {
            return null;
        }
    }
    if (endIndex < startIndex)
        return null;
    if (startIndex <= 0)
        startIndex = 0;
    int count = 0;
    for (int i = startIndex; i < token.getTokenSequence().size() && i <= endIndex; i++) {
        Token oneToken = token.getTokenSequence().get(i);
        FeatureResult<Boolean> criterionResult = this.criterion.check(oneToken, env);
        if (criterionResult != null && criterionResult.getOutcome()) {
            count++;
        }
    }
    return this.generateResult(count);
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Example 64 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

The checkInternal method of the ForwardSearchFeature class.

/**
 * Walks the token sequence forwards and returns the first token that
 * satisfies the criterion.
 * <p>
 * The walk starts at the index just after the current token (or at the
 * index given by the start index feature) and goes up to the end index
 * (the sequence size unless an end index feature is set), inclusive, never
 * beyond the last token. It ends early on a token that satisfies the stop
 * criterion, if one is set.
 *
 * @return a result wrapping the matching token, or null if the wrapped
 *         token cannot be resolved, an index feature yields no result, the
 *         index range is invalid, or no token matches
 */
@Override
public FeatureResult<TokenWrapper> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    Token token = resolvedWrapper.getToken();

    // Defaults: from the token just after this one up to the end.
    int from = token.getIndex() + 1;
    int upTo = token.getTokenSequence().size();

    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(resolvedWrapper, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> upToResult = endIndexFeature.check(resolvedWrapper, env);
        if (upToResult == null) {
            return null;
        }
        upTo = upToResult.getOutcome();
    }

    // Reject ranges that cannot contain any token.
    if (from >= token.getTokenSequence().size() || upTo < 0 || upTo < from) {
        return null;
    }
    // A negative start is treated as the first token.
    if (from < 0) {
        from = 0;
    }

    int position = from;
    while (position < token.getTokenSequence().size() && position <= upTo) {
        Token candidate = token.getTokenSequence().get(position);
        FeatureResult<Boolean> matchResult = this.criterion.check(candidate, env);
        if (matchResult != null && matchResult.getOutcome()) {
            return this.generateResult(candidate);
        }
        if (stopCriterion != null) {
            FeatureResult<Boolean> stopResult = this.stopCriterion.check(candidate, env);
            if (stopResult != null && stopResult.getOutcome()) {
                return null;
            }
        }
        position++;
    }
    return null;
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Example 65 with Token

use of com.joliciel.talismane.tokeniser.Token in project talismane by joliciel-informatique.

The checkInternal method of the LastWordInCompoundFeature class.

/**
 * Returns the last space-separated word of the token's trimmed analysis
 * text, provided that text contains at least one space.
 *
 * @return a result holding the text after the last space, or null if the
 *         wrapped token cannot be resolved or the trimmed text contains no
 *         space
 */
@Override
public FeatureResult<String> checkInternal(TokenWrapper tokenWrapper, RuntimeEnvironment env) throws TalismaneException {
    TokenWrapper resolvedWrapper = this.getToken(tokenWrapper, env);
    if (resolvedWrapper == null) {
        return null;
    }
    String analysisText = resolvedWrapper.getToken().getAnalyisText().trim();
    // No space at all means the token is not a compound.
    int lastSpaceIndex = analysisText.lastIndexOf(' ');
    if (lastSpaceIndex < 0) {
        return null;
    }
    return this.generateResult(analysisText.substring(lastSpaceIndex + 1));
}
Also used : Token(com.joliciel.talismane.tokeniser.Token)

Aggregations

Token (com.joliciel.talismane.tokeniser.Token)69 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)16 ArrayList (java.util.ArrayList)15 Sentence (com.joliciel.talismane.rawText.Sentence)14 Decision (com.joliciel.talismane.machineLearning.Decision)12 Config (com.typesafe.config.Config)12 TalismaneTest (com.joliciel.talismane.TalismaneTest)11 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)11 Test (org.junit.Test)11 TalismaneException (com.joliciel.talismane.TalismaneException)7 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)7 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)7 List (java.util.List)7 WeightedOutcome (com.joliciel.talismane.utils.WeightedOutcome)6 HashMap (java.util.HashMap)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 PosTag (com.joliciel.talismane.posTagger.PosTag)5 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)5 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)5