Search in sources :

Example 1 with TokeniserOutcome

use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.

This example is taken from the class PatternTokeniser, in the method tokeniseInternal.

/**
 * Tokenises the given atomic token sequence using the parsed test patterns and,
 * when a statistical decision maker is configured, a beam search over the
 * per-pattern JOIN/SEPARATE decisions, producing up to beam-width
 * {@link TokenisedAtomicTokenSequence} candidates, best first.
 * <p>
 * When no decision maker is configured, a single sequence built purely from
 * the pattern manager's default outcomes is returned.
 *
 * @param initialSequence the atomic token sequence to tokenise
 * @param sentence        the sentence the tokens belong to
 * @return the n most likely tokenised sequences (n = beam width)
 */
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) throws TalismaneException, IOException {
    List<TokenisedAtomicTokenSequence> sequences;
    // Assign each separator its default value
    List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(initialSequence);
    List<Decision> defaultDecisions = new ArrayList<Decision>(defaultOutcomes.size());
    for (TokeniserOutcome outcome : defaultOutcomes) {
        Decision tokeniserDecision = new Decision(outcome.name());
        // authorities record which component is responsible for each decision
        tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
        tokeniserDecision.addAuthority("_" + "DefaultDecision");
        defaultDecisions.add(tokeniserDecision);
    }
    // For each test pattern, see if anything in the sentence matches it
    if (this.decisionMaker != null) {
        List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>();
        // maps each token to the set of match sequences whose first checked token it is
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        // maps each match sequence to the pattern match on its first checked token
        Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>();
        Set<Token> matchedTokens = new HashSet<Token>();
        for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(initialSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                // only keep matches that actually have tokens to decide on
                if (matchSequence.getTokensToCheck().size() > 0) {
                    matchingSequences.add(matchSequence);
                    matchedTokens.addAll(matchSequence.getTokensToCheck());
                    TokenPatternMatch primaryMatch = null;
                    Token token = matchSequence.getTokensToCheck().get(0);
                    Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                    if (matchSequences == null) {
                        // TreeSet orders the sequences starting on this token
                        // from shortest to longest (relied on below)
                        matchSequences = new TreeSet<TokenPatternMatchSequence>();
                        tokenMatchSequenceMap.put(token, matchSequences);
                    }
                    matchSequences.add(matchSequence);
                    // locate the pattern match corresponding to the first checked token
                    for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) {
                        if (patternMatch.getToken().equals(token)) {
                            primaryMatch = patternMatch;
                            break;
                        }
                    }
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Found match: " + primaryMatch);
                    }
                    primaryMatchMap.put(matchSequence, primaryMatch);
                }
            }
        }
        // we want to create the n most likely token sequences
        // the sequence has to correspond to a token pattern
        // For each match sequence: evaluate features and ask the decision
        // maker for its JOIN/SEPARATE decisions.
        Map<TokenPatternMatchSequence, List<Decision>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision>>();
        for (TokenPatternMatchSequence matchSequence : matchingSequences) {
            TokenPatternMatch match = primaryMatchMap.get(matchSequence);
            LOG.debug("next pattern match: " + match.toString());
            List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
            for (TokenPatternMatchFeature<?> feature : features) {
                RuntimeEnvironment env = new RuntimeEnvironment();
                FeatureResult<?> featureResult = feature.check(match, env);
                // null feature results are simply skipped
                if (featureResult != null) {
                    tokenFeatureResults.add(featureResult);
                }
            }
            if (LOG.isTraceEnabled()) {
                // sorted for stable, readable trace output
                SortedSet<String> featureResultSet = tokenFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
                for (String featureResultString : featureResultSet) {
                    LOG.trace(featureResultString);
                }
            }
            List<Decision> decisions = this.decisionMaker.decide(tokenFeatureResults);
            for (ClassificationObserver observer : this.observers) observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions);
            for (Decision decision : decisions) {
                decision.addAuthority("_" + this.getClass().getSimpleName());
                decision.addAuthority("_" + "Patterns");
                decision.addAuthority(match.getPattern().getName());
            }
            matchSequenceDecisionMap.put(matchSequence, decisions);
        }
        // initially create a heap with a single, empty sequence
        PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
        TokenisedAtomicTokenSequence emptySequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
        heap.add(emptySequence);
        // beam search: extend every candidate sequence one atomic token at a time
        for (int i = 0; i < initialSequence.listWithWhiteSpace().size(); i++) {
            Token token = initialSequence.listWithWhiteSpace().get(i);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Token : \"" + token.getAnalyisText() + "\"");
            }
            // build a new heap for this iteration
            PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
            heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
            if (i == 0) {
                // first token is always "separate" from the outside world
                Decision decision = new Decision(TokeniserOutcome.SEPARATE.name());
                decision.addAuthority("_" + this.getClass().getSimpleName());
                decision.addAuthority("_" + "DefaultDecision");
                TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(emptySequence);
                newSequence.add(taggedToken);
                heap.add(newSequence);
                continue;
            }
            // limit the heap breadth to K
            int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth() : previousHeap.size();
            for (int j = 0; j < maxSequences; j++) {
                // poll retrieves candidates best-first
                TokenisedAtomicTokenSequence history = previousHeap.poll();
                // Find the separating & non-separating decisions
                if (history.size() > i) {
                    // token already added as part of a sequence
                    // introduced by another token
                    heap.add(history);
                } else if (tokenMatchSequenceMap.containsKey(token)) {
                    // token begins one or more match sequences
                    // these are ordered from shortest to longest (via
                    // TreeSet)
                    List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>(tokenMatchSequenceMap.get(token));
                    // Since sequences P1..Pn contain each other,
                    // there can be exactly matchSequences.size()
                    // consistent solutions
                    // Assume the default is separate
                    // 0: all separate
                    // 1: join P1, separate rest
                    // 2: join P2, separate rest
                    // ...
                    // n: join Pn
                    // We need to add each of these to the heap
                    // by taking the product of all probabilities
                    // consistent with each solution
                    // The probabities for each solution are (j=join,
                    // s=separate)
                    // All separate: s1 x s2 x ... x sn
                    // P1: j1 x s2 x ... x sn
                    // P2: j1 x j2 x ... x sn
                    // ...
                    // Pn: j1 x j2 x ... x jn
                    // Any solution of the form s1 x j2 would be
                    // inconsistent, and is not considered
                    // If Pi and Pj start and end on the exact same
                    // token, then the solution for both is
                    // Pi: j1 x ... x ji x jj x sj+1 ... x sn
                    // Pj: j1 x ... x ji x jj x sj+1 ... x sn
                    // Note of course that we're never likely to have
                    // more than two Ps here,
                    // but we need a solution for more just to be sure
                    // to be sure
                    TokeniserOutcome defaultOutcome = TokeniserOutcome.valueOf(defaultDecisions.get(token.getIndexWithWhiteSpace()).getOutcome());
                    TokeniserOutcome otherOutcome = null;
                    if (defaultOutcome == TokeniserOutcome.SEPARATE)
                        otherOutcome = TokeniserOutcome.JOIN;
                    else
                        otherOutcome = TokeniserOutcome.SEPARATE;
                    // decisionProbs[0] = default solution; decisionProbs[k] = join pattern Pk
                    double[] decisionProbs = new double[matchSequences.size() + 1];
                    for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] = 1;
                    // Note: k0 = default decision (e.g. separate all),
                    // k1=first pattern
                    // p1 = first pattern
                    int p = 1;
                    int prevEndIndex = -1;
                    for (TokenPatternMatchSequence matchSequence : matchSequences) {
                        int endIndex = matchSequence.getTokensToCheck().get(matchSequence.getTokensToCheck().size() - 1).getEndIndex();
                        List<Decision> decisions = matchSequenceDecisionMap.get(matchSequence);
                        for (Decision decision : decisions) {
                            for (int k = 0; k < decisionProbs.length; k++) {
                                // endIndex <= prevEndIndex means this pattern ends on the
                                // same token as the previous one — both share a solution
                                if (decision.getOutcome().equals(defaultOutcome.name())) {
                                    // e.g. separate in most cases
                                    if (k < p && endIndex > prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                    else if (k + 1 < p && endIndex <= prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                } else {
                                    // e.g. join in most cases
                                    if (k >= p && endIndex > prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                    else if (k + 1 >= p && endIndex <= prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                }
                            }
                        // next k
                        }
                        // next decision (only 2 of these)
                        prevEndIndex = endIndex;
                        p++;
                    }
                    // transform to probability distribution
                    double sumProbs = 0;
                    for (int k = 0; k < decisionProbs.length; k++) sumProbs += decisionProbs[k];
                    if (sumProbs > 0)
                        for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] /= sumProbs;
                    // Apply default decision
                    // Since this is the default decision for all tokens
                    // in the sequence, we don't add the other tokens
                    // for now,
                    // so as to allow them
                    // to get examined one at a time, just in case one
                    // of them starts its own separate sequence
                    Decision defaultDecision = new Decision(defaultOutcome.name(), decisionProbs[0]);
                    defaultDecision.addAuthority("_" + this.getClass().getSimpleName());
                    defaultDecision.addAuthority("_" + "Patterns");
                    for (TokenPatternMatchSequence matchSequence : matchSequences) {
                        defaultDecision.addAuthority(matchSequence.getTokenPattern().getName());
                    }
                    TaggedToken<TokeniserOutcome> defaultTaggedToken = new TaggedToken<>(token, defaultDecision, TokeniserOutcome.valueOf(defaultDecision.getOutcome()));
                    TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(history);
                    defaultSequence.add(defaultTaggedToken);
                    defaultSequence.addDecision(defaultDecision);
                    heap.add(defaultSequence);
                    // Apply one non-default decision per match sequence
                    for (int k = 0; k < matchSequences.size(); k++) {
                        TokenPatternMatchSequence matchSequence = matchSequences.get(k);
                        double prob = decisionProbs[k + 1];
                        Decision decision = new Decision(otherOutcome.name(), prob);
                        decision.addAuthority("_" + this.getClass().getSimpleName());
                        decision.addAuthority("_" + "Patterns");
                        decision.addAuthority(matchSequence.getTokenPattern().getName());
                        TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                        TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
                        newSequence.add(taggedToken);
                        newSequence.addDecision(decision);
                        // add all the other tokens covered by this match sequence
                        // in this sequence to the solution
                        for (Token tokenInSequence : matchSequence.getTokensToCheck()) {
                            if (tokenInSequence.equals(token)) {
                                continue;
                            }
                            Decision decisionInSequence = new Decision(decision.getOutcome());
                            decisionInSequence.addAuthority("_" + this.getClass().getSimpleName());
                            decisionInSequence.addAuthority("_" + "DecisionInSequence");
                            decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default");
                            decisionInSequence.addAuthority("_" + "Patterns");
                            TaggedToken<TokeniserOutcome> taggedTokenInSequence = new TaggedToken<>(tokenInSequence, decisionInSequence, TokeniserOutcome.valueOf(decisionInSequence.getOutcome()));
                            newSequence.add(taggedTokenInSequence);
                        }
                        heap.add(newSequence);
                    }
                // next sequence
                } else {
                    // token doesn't start match sequence, and hasn't
                    // already been added to the current sequence
                    Decision decision = defaultDecisions.get(i);
                    if (matchedTokens.contains(token)) {
                        // token is inside some pattern match: tag the copy
                        // with pattern-related authorities
                        decision = new Decision(decision.getOutcome());
                        decision.addAuthority("_" + this.getClass().getSimpleName());
                        decision.addAuthority("_" + "DecisionInSequence");
                        decision.addAuthority("_" + "DecisionInSequence_default");
                        decision.addAuthority("_" + "Patterns");
                    }
                    TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                    TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
                    newSequence.add(taggedToken);
                    heap.add(newSequence);
                }
            }
        // next sequence in the old heap
        }
        // next token
        // keep only the best beam-width sequences from the final heap
        sequences = new ArrayList<TokenisedAtomicTokenSequence>();
        int k = 0;
        while (!heap.isEmpty()) {
            sequences.add(heap.poll());
            k++;
            if (k >= this.getBeamWidth())
                break;
        }
    } else {
        // no decision maker: return a single sequence built from the default outcomes
        sequences = new ArrayList<TokenisedAtomicTokenSequence>();
        TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
        int i = 0;
        for (Token token : initialSequence.listWithWhiteSpace()) {
            Decision decision = defaultDecisions.get(i++);
            TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
            defaultSequence.add(taggedToken);
        }
        sequences.add(defaultSequence);
    }
    // have decision maker?
    return sequences;
}
Also used : ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) PriorityQueue(java.util.PriorityQueue) LoggerFactory(org.slf4j.LoggerFactory) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) TalismaneSession(com.joliciel.talismane.TalismaneSession) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) TokenPatternMatchFeatureParser(com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeatureParser) TokenPatternMatchFeature(com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeature) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Tokeniser(com.joliciel.talismane.tokeniser.Tokeniser) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) Token(com.joliciel.talismane.tokeniser.Token) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) SortedSet(java.util.SortedSet) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) PriorityQueue(java.util.PriorityQueue) Decision(com.joliciel.talismane.machineLearning.Decision) ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 2 with TokeniserOutcome

use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.

This example is taken from the class TokeniserPatternManager, in the method getDefaultOutcomes.

/**
 * Takes a sequence of atomic tokens and applies the default decision
 * (JOIN or SEPARATE) to each one, based on the separator default
 * patterns configured for this manager.
 * <p>
 * Non-separator tokens inherit the outcome implied by the most recent
 * separator: e.g. after an IS_SEPARATOR_BEFORE separator, following
 * tokens are joined until another separator changes the state.
 *
 * @param tokenSequence the atomic token sequence to analyse
 * @return one default outcome per token in
 *         {@code tokenSequence.listWithWhiteSpace()}, in order
 */
public List<TokeniserOutcome> getDefaultOutcomes(TokenSequence tokenSequence) {
    List<TokeniserOutcome> defaultOutcomes = new ArrayList<TokeniserOutcome>();
    // Assign each separator its default value.
    // nextOutcome carries the current decision forward to the
    // following non-separator tokens.
    TokeniserOutcome nextOutcome = TokeniserOutcome.SEPARATE;
    Pattern tokenSeparators = Tokeniser.getTokenSeparators(sessionId);
    for (Token token : tokenSequence.listWithWhiteSpace()) {
        TokeniserOutcome outcome = null;
        if (tokenSeparators.matcher(token.getAnalyisText()).matches()) {
            boolean defaultValueFound = false;
            for (Entry<SeparatorDecision, Pattern> entry : this.getSeparatorDefaultPatterns().entrySet()) {
                if (entry.getValue().matcher(token.getAnalyisText()).matches()) {
                    defaultValueFound = true;
                    SeparatorDecision defaultSeparatorDecision = entry.getKey();
                    switch(defaultSeparatorDecision) {
                        case IS_SEPARATOR:
                            outcome = TokeniserOutcome.SEPARATE;
                            nextOutcome = TokeniserOutcome.SEPARATE;
                            break;
                        case IS_NOT_SEPARATOR:
                            outcome = TokeniserOutcome.JOIN;
                            nextOutcome = TokeniserOutcome.JOIN;
                            break;
                        case IS_SEPARATOR_BEFORE:
                            // separate from what precedes, join to what follows
                            outcome = TokeniserOutcome.SEPARATE;
                            nextOutcome = TokeniserOutcome.JOIN;
                            // bug fix: missing break here caused fall-through
                            // into IS_SEPARATOR_AFTER, overwriting both outcomes
                            break;
                        case IS_SEPARATOR_AFTER:
                            // join to what precedes, separate from what follows
                            outcome = TokeniserOutcome.JOIN;
                            nextOutcome = TokeniserOutcome.SEPARATE;
                            break;
                        case NOT_APPLICABLE:
                            break;
                        default:
                            break;
                    }
                    // first matching default pattern wins
                    break;
                }
            }
            // separators with no default pattern are separated by default
            if (!defaultValueFound) {
                outcome = TokeniserOutcome.SEPARATE;
                nextOutcome = TokeniserOutcome.SEPARATE;
            }
            defaultOutcomes.add(outcome);
        } else {
            defaultOutcomes.add(nextOutcome);
        }
    }
    return defaultOutcomes;
}
Also used : Pattern(java.util.regex.Pattern) SeparatorDecision(com.joliciel.talismane.tokeniser.SeparatorDecision) ArrayList(java.util.ArrayList) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome)

Example 3 with TokeniserOutcome

use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.

This example is taken from the class TokenFScoreCalculator, in the method onEvaluationComplete.

/**
 * Called when evaluation is complete: logs per-tagger f-scores, writes the
 * overall scores to the f-score CSV file (if configured), and writes
 * detailed per-outcome statistics and error records to the plain-text and
 * CSV error writers (if configured).
 * <p>
 * Bug fix: the CSV block's {@code finally} previously closed
 * {@code errorWriter} (already closed above) instead of
 * {@code csvErrorWriter}, leaking the CSV writer.
 *
 * @throws IOException if writing to any of the output files fails
 */
@Override
public void onEvaluationComplete() throws IOException {
    // summary log output, one line per tagger (authority)
    for (String tagger : taggerFScoreCalculators.keySet()) {
        LOG.debug("###### Tagger " + tagger);
        FScoreCalculator<TokeniserOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(tagger);
        LOG.debug("###### Tagger " + tagger + ": f-score = " + taggerFScoreCalculator.getTotalFScore());
    }
    if (fScoreFile != null) {
        fScoreCalculator.writeScoresToCSVFile(fScoreFile);
    }
    // plain-text error report
    if (errorWriter != null) {
        try {
            for (String tagger : taggerFScoreCalculators.keySet()) {
                FScoreCalculator<TokeniserOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(tagger);
                errorWriter.write("###### Tagger " + tagger + ": f-score = " + taggerFScoreCalculator.getTotalFScore() + "\n");
                errorWriter.write("Total " + (taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount()) + "\n");
                errorWriter.write("True + " + taggerFScoreCalculator.getTotalTruePositiveCount() + "\n");
                errorWriter.write("False- " + taggerFScoreCalculator.getTotalFalseNegativeCount() + "\n");
                errorWriter.write("False+ " + taggerFScoreCalculator.getTotalFalsePositiveCount() + "\n");
                // per-outcome breakdown (JOIN / SEPARATE)
                for (TokeniserOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
                    errorWriter.write(outcome + " total  " + (taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                    errorWriter.write(outcome + " true + " + (taggerFScoreCalculator.getTruePositiveCount(outcome)) + "\n");
                    errorWriter.write(outcome + " false- " + (taggerFScoreCalculator.getFalseNegativeCount(outcome)) + "\n");
                    errorWriter.write(outcome + " false+ " + (taggerFScoreCalculator.getFalsePositiveCount(outcome)) + "\n");
                    errorWriter.write(outcome + " precis " + (taggerFScoreCalculator.getPrecision(outcome)) + "\n");
                    errorWriter.write(outcome + " recall " + (taggerFScoreCalculator.getRecall(outcome)) + "\n");
                    errorWriter.write(outcome + " fscore " + (taggerFScoreCalculator.getFScore(outcome)) + "\n");
                }
                // individual error records for this tagger, if any
                List<TokeniserErrorRecord> errors = errorMap.get(tagger);
                if (errors != null) {
                    for (TokeniserErrorRecord errorRecord : errors) {
                        errorWriter.write("guess " + errorRecord.guessDecision + ", real " + errorRecord.realDecision + ", context: " + errorRecord.context + "\n");
                    }
                }
                errorWriter.flush();
            }
        } finally {
            errorWriter.close();
        }
    }
    // CSV error report (same data, machine-readable)
    if (csvErrorWriter != null) {
        try {
            for (String tagger : taggerFScoreCalculators.keySet()) {
                FScoreCalculator<TokeniserOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(tagger);
                csvErrorWriter.write(CSV.format("Authority") + CSV.format("total") + CSV.format("true+") + CSV.format("false-") + CSV.format("false+") + CSV.format("precis") + CSV.format("recall") + CSV.format("fscore") + "\n");
                csvErrorWriter.write(CSV.format(tagger) + CSV.format((taggerFScoreCalculator.getTotalTruePositiveCount() + taggerFScoreCalculator.getTotalFalseNegativeCount())) + CSV.format(taggerFScoreCalculator.getTotalTruePositiveCount()) + CSV.format(taggerFScoreCalculator.getTotalFalseNegativeCount()) + CSV.format(taggerFScoreCalculator.getTotalFalsePositiveCount()) + CSV.format(taggerFScoreCalculator.getTotalPrecision()) + CSV.format(taggerFScoreCalculator.getTotalRecall()) + CSV.format(taggerFScoreCalculator.getTotalFScore()) + "\n");
                for (TokeniserOutcome outcome : taggerFScoreCalculator.getOutcomeSet()) {
                    csvErrorWriter.write(CSV.format(outcome.name()) + CSV.format((taggerFScoreCalculator.getTruePositiveCount(outcome) + taggerFScoreCalculator.getFalseNegativeCount(outcome))) + CSV.format(taggerFScoreCalculator.getTruePositiveCount(outcome)) + CSV.format(taggerFScoreCalculator.getFalseNegativeCount(outcome)) + CSV.format(taggerFScoreCalculator.getFalsePositiveCount(outcome)) + CSV.format(taggerFScoreCalculator.getPrecision(outcome)) + CSV.format(taggerFScoreCalculator.getRecall(outcome)) + CSV.format(taggerFScoreCalculator.getFScore(outcome)) + "\n");
                }
                List<TokeniserErrorRecord> errors = errorMap.get(tagger);
                if (errors != null) {
                    for (TokeniserErrorRecord errorRecord : errors) {
                        csvErrorWriter.write(CSV.format(errorRecord.guessDecision.name()));
                        csvErrorWriter.write(CSV.format(errorRecord.realDecision.name()));
                        csvErrorWriter.write(CSV.format(errorRecord.context));
                        csvErrorWriter.write("\n");
                    }
                }
                csvErrorWriter.flush();
            }
        } finally {
            // was errorWriter.close() — a copy-paste bug leaking csvErrorWriter
            csvErrorWriter.close();
        }
    }
}
Also used : TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome)

Example 4 with TokeniserOutcome

use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.

This example is taken from the class PatternEventStream, in the method getTaggedTokens.

/**
 * Tags every atomic token (including whitespace) with SEPARATE when a gold
 * token split starts at its start index, and JOIN otherwise.
 *
 * @param tokenSequence the atomic token sequence to tag
 * @param tokenSplits   start indexes of the gold-standard token boundaries
 * @return one tagged token per atomic token, in sequence order
 */
public List<TaggedToken<TokeniserOutcome>> getTaggedTokens(TokenSequence tokenSequence, List<Integer> tokenSplits) {
    List<TaggedToken<TokeniserOutcome>> taggedTokens = new ArrayList<TaggedToken<TokeniserOutcome>>();
    for (Token atomicToken : tokenSequence.listWithWhiteSpace()) {
        // a gold split starting here means this token begins a new word
        boolean startsNewToken = tokenSplits.contains(atomicToken.getStartIndex());
        TokeniserOutcome outcome = startsNewToken ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;
        Decision decision = new Decision(outcome.name());
        taggedTokens.add(new TaggedToken<>(atomicToken, decision, TokeniserOutcome.valueOf(decision.getOutcome())));
    }
    return taggedTokens;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision)

Example 5 with TokeniserOutcome

use of com.joliciel.talismane.tokeniser.TokeniserOutcome in project talismane by joliciel-informatique.

This example is taken from the class PatternEventStream, in the method hasNext.

/**
 * Returns true if there is at least one more pattern-match training event.
 * <p>
 * Lazily advances through the corpus: when the current sentence's matches
 * are exhausted, reads the next sentence, re-tokenises it atomically, runs
 * every parsed test pattern over it, and records one (match, outcome) pair
 * per match sequence in {@code currentPatternMatches}/{@code currentOutcomes}.
 * Sentences with no matches are skipped.
 *
 * @return true if another event is available
 */
@Override
public boolean hasNext() throws TalismaneException, IOException {
    // null out the current batch once all its matches have been consumed
    if (currentPatternMatches != null) {
        if (currentIndex == currentPatternMatches.size()) {
            currentPatternMatches = null;
        }
    }
    // keep reading sentences until one yields at least one pattern match
    while (currentPatternMatches == null) {
        if (this.corpusReader.hasNextSentence()) {
            currentPatternMatches = new ArrayList<TokenPatternMatch>();
            currentOutcomes = new ArrayList<TokeniserOutcome>();
            currentIndex = 0;
            TokenSequence realSequence = corpusReader.nextTokenSequence();
            // gold-standard token boundary positions
            List<Integer> tokenSplits = realSequence.getTokenSplits();
            String text = realSequence.getSentence().getText().toString();
            LOG.debug("Sentence: " + text);
            // re-tokenise the raw text into atomic tokens
            Sentence sentence = new Sentence(text, sessionId);
            TokenSequence tokenSequence = new TokenSequence(sentence, sessionId);
            tokenSequence.findDefaultTokens();
            List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence);
            // gold outcomes (JOIN/SEPARATE) for each atomic token
            List<TaggedToken<TokeniserOutcome>> currentSentence = this.getTaggedTokens(tokenSequence, tokenSplits);
            // check if anything matches each pattern
            for (TokenPattern parsedPattern : this.tokeniserPatternManager.getParsedTestPatterns()) {
                List<TokenPatternMatchSequence> tokenPatternMatches = parsedPattern.match(tokenSequence);
                for (TokenPatternMatchSequence tokenPatternMatchSequence : tokenPatternMatches) {
                    if (LOG.isTraceEnabled())
                        LOG.trace("Matched pattern: " + parsedPattern + ": " + tokenPatternMatchSequence.getTokenSequence());
                    // check if entire pattern is separated or joined
                    TokeniserOutcome outcome = null;
                    TokeniserOutcome defaultOutcome = null;
                    boolean haveMismatch = false;
                    TokenPatternMatch tokenPatternMatch = null;
                    for (Token token : tokenPatternMatchSequence.getTokensToCheck()) {
                        if (tokenPatternMatch == null) {
                            // locate the pattern match on the first token to check
                            for (TokenPatternMatch patternMatch : tokenPatternMatchSequence.getTokenPatternMatches()) {
                                if (patternMatch.getToken().equals(token)) {
                                    tokenPatternMatch = patternMatch;
                                    break;
                                }
                            }
                        }
                        TaggedToken<TokeniserOutcome> taggedToken = currentSentence.get(token.getIndexWithWhiteSpace());
                        if (outcome == null) {
                            // first checked token fixes the expected outcome
                            outcome = taggedToken.getTag();
                            defaultOutcome = defaultOutcomes.get(token.getIndexWithWhiteSpace());
                        } else if (taggedToken.getTag() != outcome) {
                            // this should only happen when two patterns
                            // overlap:
                            // e.g. "aussi bien que" and "bien que", or
                            // "plutot que" and "plutot que de"
                            // AND the outer pattern is separated, while
                            // the inner pattern is joined
                            LOG.debug("Mismatch in pattern: " + tokenPatternMatch + ", " + taggedToken);
                            haveMismatch = true;
                        }
                    }
                    currentPatternMatches.add(tokenPatternMatch);
                    // on a mismatch, fall back to the default outcome for training
                    if (haveMismatch) {
                        currentOutcomes.add(defaultOutcome);
                    } else {
                        currentOutcomes.add(outcome);
                    }
                }
            }
            // no matches in this sentence: reset and try the next one
            if (currentPatternMatches.size() == 0) {
                currentPatternMatches = null;
                currentOutcomes = null;
            }
        } else {
            // corpus exhausted
            break;
        }
    }
    return currentPatternMatches != null;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)10 Token (com.joliciel.talismane.tokeniser.Token)8 TaggedToken (com.joliciel.talismane.tokeniser.TaggedToken)6 ArrayList (java.util.ArrayList)6 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)5 TokenisedAtomicTokenSequence (com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)5 Decision (com.joliciel.talismane.machineLearning.Decision)4 Sentence (com.joliciel.talismane.rawText.Sentence)4 TreeSet (java.util.TreeSet)4 TalismaneException (com.joliciel.talismane.TalismaneException)3 HashMap (java.util.HashMap)3 List (java.util.List)3 Set (java.util.Set)3 TalismaneSession (com.joliciel.talismane.TalismaneSession)2 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)2 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)2 TokenPatternMatchFeature (com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeature)2 IOException (java.io.IOException)2 HashSet (java.util.HashSet)2 Map (java.util.Map)2