Search in sources :

Example 6 with PosTagSequence

use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.

the class PosTagComparator method evaluate.

/**
 * Walks the reference corpus sentence by sentence. For each reference
 * pos-tag sequence, the next sequence is read from the evaluation corpus,
 * wrapped in a single-element list, and handed to every registered observer
 * together with the reference sequence. Once the reference corpus has no
 * more sentences, every observer is told that the evaluation is complete.
 *
 * @throws TalismaneException
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (referenceCorpusReader.hasNextSentence()) {
        PosTagSequence expected = referenceCorpusReader.nextPosTagSequence();
        // Observers take a list of guesses; this comparator always has
        // exactly one guess per sentence.
        List<PosTagSequence> guesses = new ArrayList<PosTagSequence>();
        guesses.add(evaluationCorpusReader.nextPosTagSequence());
        for (PosTagEvaluationObserver listener : this.observers) {
            listener.onNextPosTagSequence(expected, guesses);
        }
    }
    for (PosTagEvaluationObserver listener : this.observers) {
        listener.onEvaluationComplete();
    }
}
Also used : ArrayList(java.util.ArrayList) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence)

Example 7 with PosTagSequence

use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.

the class PosTagFScoreCalculator method onNextPosTagSequence.

/**
 * Compares the first guessed sequence against the real sequence token by
 * token and records each (real tag, guessed result) pair in the f-score
 * calculators.
 * <p>
 * The two sequences are walked with separate indexes ({@code i} for the real
 * sequence, {@code j} for the guessed one) because they may be tokenised
 * differently. When the start/end indexes of the two current tokens differ,
 * the result recorded is {@code "TOKEN_ERROR"} instead of the guessed tag
 * code.
 *
 * @param realSequence
 *            the reference sequence
 * @param guessedSequences
 *            the guessed sequences; only the element at index 0 is read
 * @throws TalismaneException
 */
@Override
public void onNextPosTagSequence(PosTagSequence realSequence, List<PosTagSequence> guessedSequences) throws TalismaneException {
    PosTagSequence guessedSequence = guessedSequences.get(0);
    // j is the position in the guessed sequence; it advances independently
    // of i, the position in the real sequence.
    int j = 0;
    for (int i = 0; i < realSequence.size(); i++) {
        TaggedToken<PosTag> realToken = realSequence.get(i);
        TaggedToken<PosTag> testToken = guessedSequence.get(j);
        // special handling for null tags & empty tokens
        if (realToken.getTag().equals(PosTag.NULL_POS_TAG)) {
            // If the real token carries the null pos-tag, nothing is scored
            // for it. Presumably, as in the branch below, the previous
            // non-empty token took care of any required comparisons.
            // NOTE(review): j is not range-checked after this increment; if
            // further real tokens follow, the get(j) at the top of the loop
            // looks like it could run past the end - confirm.
            if (testToken.getToken().isEmpty()) {
                j++;
            }
            continue;
        } else if (testToken.getToken().isEmpty() && !realToken.getToken().isEmpty()) {
            // If the test token is empty, but the real token isn't, we skip
            // this as well
            // Again, we assume the previous non-empty token took care of
            // any required comparisons.
            // NOTE(review): get(j) below is not range-checked either -
            // confirm an empty test token can never be the last one.
            j++;
            testToken = guessedSequence.get(j);
        }
        boolean tokenError = false;
        if (realToken.getToken().getStartIndex() == testToken.getToken().getStartIndex() && realToken.getToken().getEndIndex() == testToken.getToken().getEndIndex()) {
            // no token error
            // Move on to the next guessed token, but stay on the last one
            // when the end of the guessed sequence is reached.
            j++;
            if (j == guessedSequence.size()) {
                j--;
            }
        } else {
            // The token boundaries differ: advance through the guessed
            // sequence until a test token ends after the real token ends,
            // or until the last guessed token is reached.
            tokenError = true;
            while (realToken.getToken().getEndIndex() >= testToken.getToken().getEndIndex()) {
                j++;
                if (j == guessedSequence.size()) {
                    j--;
                    break;
                }
                testToken = guessedSequence.get(j);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Token " + testToken.getToken().getAnalyisText() + ", guessed: " + testToken.getTag().getCode() + " (" + testToken.getDecision().getProbability() + "), actual: " + realToken.getTag().getCode());
        }
        // The recorded result is the guessed tag code, unless the token
        // boundaries did not line up.
        String result = testToken.getTag().getCode();
        if (tokenError)
            result = "TOKEN_ERROR";
        fScoreCalculator.increment(realToken.getTag().getCode(), result);
        // Also record the pair in a second calculator chosen by whether the
        // test token has any possible pos-tags (null or empty counts as
        // unknown in the lexicon).
        if (testToken.getToken().getPossiblePosTags() == null || testToken.getToken().getPossiblePosTags().size() == 0)
            fscoreUnknownInLexicon.increment(realToken.getTag().getCode(), result);
        else
            fscoreKnownInLexicon.increment(realToken.getTag().getCode(), result);
    }
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence)

Example 8 with PosTagSequence

use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.

the class PosTagLexicalCoverageTester method onNextPosTagSequence.

/**
 * Updates the lexical-coverage statistics with one evaluated sentence.
 * <p>
 * Each real token is classified as unknown (its set of possible pos-tags is
 * non-null and empty) or known (anything else). Unknown tokens are scored in
 * {@code fscoreUnknownInLexicon} against the guessed token at the same index
 * and counted per {@code tag|text} key in {@code unknownWords}; known tokens
 * are counted and their text is added to {@code knownWords}. In addition,
 * any token whose real tag is closed-class but is missing from the token's
 * possible pos-tags is added to {@code closedCategoryMismatches}.
 * <p>
 * The real and guessed sequences are read at the same index, so they are
 * assumed to be tokenised identically.
 *
 * @param realSequence
 *            the reference sequence
 * @param guessedSequences
 *            the guessed sequences; only the element at index 0 is read
 * @throws TalismaneException
 */
@Override
public void onNextPosTagSequence(PosTagSequence realSequence, List<PosTagSequence> guessedSequences) throws TalismaneException {
    PosTagSequence guessedSequence = guessedSequences.get(0);
    for (int i = 0; i < realSequence.size(); i++) {
        TaggedToken<PosTag> realToken = realSequence.get(i);
        TaggedToken<PosTag> testToken = guessedSequence.get(i);
        // A null set of possible pos-tags means no lexicon information is
        // available for this token; it is neither "unknown" nor eligible for
        // the closed-category check below.
        boolean hasPossiblePosTags = realToken.getToken().getPossiblePosTags() != null;
        boolean tokenUnknown = hasPossiblePosTags && realToken.getToken().getPossiblePosTags().isEmpty();
        if (tokenUnknown) {
            fscoreUnknownInLexicon.increment(realToken.getTag().getCode(), testToken.getTag().getCode());
            unknownWordCount++;
            // Build the "tag|text" key once and reuse it for lookup and store.
            String unknownKey = realToken.getTag() + "|" + realToken.getToken().getAnalyisText();
            Integer countObj = unknownWords.get(unknownKey);
            int count = countObj == null ? 0 : countObj.intValue();
            unknownWords.put(unknownKey, count + 1);
        } else {
            knownWordCount++;
            knownWords.add(realToken.getToken().getAnalyisText());
        }
        // Guarded by hasPossiblePosTags: the unknown-token test above already
        // allows for a null set, so dereferencing it unguarded here would
        // throw a NullPointerException for closed-class tags.
        if (hasPossiblePosTags && realToken.getTag().getOpenClassIndicator().isClosed() && !realToken.getToken().getPossiblePosTags().contains(realToken.getTag())) {
            closedCategoryMismatches.add(realToken.getTag() + "|" + realToken.getToken().getAnalyisText());
        }
    }
}
Also used : PosTag(com.joliciel.talismane.posTagger.PosTag) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence)

Example 9 with PosTagSequence

use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.

the class PosTaggerEvaluator method evaluate.

/**
 * Evaluate a given pos tagger.
 * <p>
 * For every sentence in the corpus, the reference pos-tag sequence is read,
 * the sentence is optionally re-tokenised (when a tokeniser is set, its first
 * token sequence is used), and the pos tagger is applied. All observers are
 * then given the reference sequence and the list of guessed sequences. When
 * the corpus is exhausted, all observers are told evaluation is complete.
 * <p>
 * Observers always receive a non-null list: for a
 * {@link NonDeterministicPosTagger} it is the list the tagger returned; for
 * any other pos tagger it is a single-element list holding the one guessed
 * sequence.
 *
 * @throws TalismaneException
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        PosTagSequence realPosTagSequence = corpusReader.nextPosTagSequence();
        TokenSequence tokenSequence = realPosTagSequence.getTokenSequence();
        List<TokenSequence> tokenSequences;
        if (this.tokeniser != null) {
            // Re-tokenise the raw sentence and continue with the first
            // token sequence the tokeniser returned.
            Sentence sentence = tokenSequence.getSentence();
            tokenSequences = tokeniser.tokenise(sentence);
            tokenSequence = tokenSequences.get(0);
        } else {
            tokenSequences = new ArrayList<TokenSequence>();
            tokenSequences.add(tokenSequence);
        }
        List<PosTagSequence> guessedSequences;
        PosTagSequence guessedSequence;
        if (posTagger instanceof NonDeterministicPosTagger) {
            NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) posTagger;
            guessedSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
            guessedSequence = guessedSequences.get(0);
        } else {
            guessedSequence = posTagger.tagSentence(tokenSequence);
            // Wrap the single guess in a list; observers read
            // guessedSequences.get(0), so leaving the list null here would
            // make them fail with a NullPointerException.
            guessedSequences = new ArrayList<PosTagSequence>();
            guessedSequences.add(guessedSequence);
        }
        if (LOG.isDebugEnabled()) {
            // Debug output per token: originalText[tag|lemma/morphology|...]
            StringBuilder stringBuilder = new StringBuilder();
            for (PosTaggedToken posTaggedToken : guessedSequence) {
                Set<String> lemmas = new TreeSet<String>();
                stringBuilder.append(posTaggedToken.getToken().getOriginalText());
                stringBuilder.append("[" + posTaggedToken.getTag());
                List<LexicalEntry> entries = posTaggedToken.getLexicalEntries();
                // With several entries, the first not-yet-printed entry whose
                // lemma equals the token text is left out of the output.
                boolean dropCurrentWord = entries.size() > 1;
                for (LexicalEntry entry : entries) {
                    if (!lemmas.contains(entry.getLemma())) {
                        if (dropCurrentWord && posTaggedToken.getToken().getText().equals(entry.getLemma())) {
                            dropCurrentWord = false;
                            continue;
                        }
                        stringBuilder.append("|" + entry.getLemma());
                        stringBuilder.append("/" + entry.getMorphology());
                        lemmas.add(entry.getLemma());
                    }
                }
                stringBuilder.append("] ");
            }
            LOG.debug(stringBuilder.toString());
        }
        for (PosTagEvaluationObserver observer : this.observers) {
            observer.onNextPosTagSequence(realPosTagSequence, guessedSequences);
        }
    }
    for (PosTagEvaluationObserver observer : this.observers) {
        observer.onEvaluationComplete();
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) TreeSet(java.util.TreeSet) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 10 with PosTagSequence

use of com.joliciel.talismane.posTagger.PosTagSequence in project talismane by joliciel-informatique.

the class TokenSearchFeature method check.

/**
 * Scans the pos-tag sequence of the current parse configuration for a token
 * satisfying {@code criterion} and returns a result generated from it.
 * <p>
 * The scan runs from the start index to the end index inclusive (both
 * clamped to the sequence bounds), going forwards when the end index is
 * greater than the start index and backwards otherwise. Defaults are the
 * first and last positions of the sequence. With find-first (the default)
 * the scan stops at the first match; otherwise the last match encountered
 * is kept.
 * <p>
 * {@code skipCriteria} is read in pairs: when an even-indexed criterion
 * holds on a token, matching and stop checks are suspended until the
 * criterion at the following index holds. When not skipping, a token
 * satisfying {@code stopCriterion} ends the scan.
 *
 * @return the result for the matching token, or {@code null} if nothing
 *         matched or if the start-index, end-index or find-first feature
 *         returned no result
 * @throws TalismaneException
 */
@Override
public FeatureResult<PosTaggedTokenWrapper> check(ParseConfigurationWrapper context, RuntimeEnvironment env) throws TalismaneException {
    PosTagSequence sequence = context.getParseConfiguration().getPosTagSequence();
    int from = 0;
    int to = sequence.size() - 1;

    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(context, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> toResult = endIndexFeature.check(context, env);
        if (toResult == null) {
            return null;
        }
        to = toResult.getOutcome();
    }

    // Clamp both bounds into [0, size - 1].
    from = Math.min(Math.max(from, 0), sequence.size() - 1);
    to = Math.min(Math.max(to, 0), sequence.size() - 1);
    int step = to > from ? 1 : -1;

    boolean stopAtFirstMatch = true;
    if (findFirstFeature != null) {
        FeatureResult<Boolean> findFirstResult = this.findFirstFeature.check(context, env);
        if (findFirstResult == null) {
            return null;
        }
        stopAtFirstMatch = findFirstResult.getOutcome();
    }

    ParseConfigurationAddress address = new ParseConfigurationAddress(env);
    address.setParseConfiguration(context.getParseConfiguration());

    PosTaggedToken match = null;
    // Index of the skip-start criterion currently in force, or -1 when the
    // scan is not inside a skipped stretch.
    int activeSkip = -1;
    for (int pos = from; step > 0 ? (pos < sequence.size() && pos <= to) : (pos >= 0 && pos >= to); pos += step) {
        PosTaggedToken candidate = sequence.get(pos);
        address.setPosTaggedToken(candidate);

        if (activeSkip < 0) {
            FeatureResult<Boolean> matchResult = this.criterion.check(address, env);
            if (matchResult != null && matchResult.getOutcome()) {
                match = candidate;
                if (stopAtFirstMatch) {
                    break;
                }
            }
        }

        boolean skipEnds = false;
        if (skipCriteria != null && skipCriteria.length > 0) {
            if (activeSkip >= 0) {
                // Inside a skipped stretch: test the end criterion paired
                // with the start criterion that opened it.
                FeatureResult<Boolean> closeResult = skipCriteria[activeSkip + 1].check(address, env);
                skipEnds = closeResult != null && closeResult.getOutcome();
            } else {
                // Not skipping: see whether any start criterion (even
                // positions) opens a skipped stretch on this token.
                for (int k = 0; k < skipCriteria.length; k += 2) {
                    FeatureResult<Boolean> openResult = skipCriteria[k].check(address, env);
                    if (openResult != null && openResult.getOutcome()) {
                        activeSkip = k;
                        break;
                    }
                }
            }
        }

        if (activeSkip < 0 && stopCriterion != null) {
            FeatureResult<Boolean> stopResult = this.stopCriterion.check(address, env);
            if (stopResult != null && stopResult.getOutcome()) {
                break;
            }
        }

        // The skipped stretch is only closed after the stop check, so the
        // token that ends a skip is itself never tested for stopping.
        if (skipEnds) {
            activeSkip = -1;
        }
    }

    if (match == null) {
        return null;
    }
    return this.generateResult(match);
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) PosTaggedTokenWrapper(com.joliciel.talismane.posTagger.features.PosTaggedTokenWrapper) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence)

Aggregations

PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)23 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)14 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)13 Sentence (com.joliciel.talismane.rawText.Sentence)12 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)7 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)7 Token (com.joliciel.talismane.tokeniser.Token)7 TalismaneTest (com.joliciel.talismane.TalismaneTest)6 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)6 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)6 Config (com.typesafe.config.Config)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5 List (java.util.List)5 Test (org.junit.Test)5 TalismaneException (com.joliciel.talismane.TalismaneException)4 DependencyArc (com.joliciel.talismane.parser.DependencyArc)4 SentenceAnnotator (com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator)4 NonDeterministicPosTagger (com.joliciel.talismane.posTagger.NonDeterministicPosTagger)3 PosTag (com.joliciel.talismane.posTagger.PosTag)3