Search in sources :

Example 1 with TokenisedAtomicTokenSequence

use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in project talismane by joliciel-informatique.

the class PatternTokeniser method tokeniseInternal.

/**
 * Builds the candidate tokenisations of {@code initialSequence}.
 * <p>
 * Every token first receives a default decision taken from the tokeniser
 * pattern manager. If no decision maker is configured, a single sequence made
 * of these default decisions is returned. Otherwise the parsed test patterns
 * are matched against the initial sequence, the decision maker is asked for
 * decisions on each match, and a beam search (limited to
 * {@code getBeamWidth()} sequences per step) combines default and
 * pattern-based decisions token by token.
 *
 * @param initialSequence the sequence whose tokens (including white space)
 *        are to be tagged
 * @param sentence the sentence passed to each new
 *        {@link TokenisedAtomicTokenSequence}
 * @return at most {@code getBeamWidth()} sequences polled from the final heap
 *         when a decision maker is present, otherwise exactly one sequence of
 *         default decisions
 * @throws TalismaneException propagated from the calls made in this method
 * @throws IOException propagated from the calls made in this method
 */
@Override
protected List<TokenisedAtomicTokenSequence> tokeniseInternal(TokenSequence initialSequence, Sentence sentence) throws TalismaneException, IOException {
    List<TokenisedAtomicTokenSequence> sequences;
    // Assign each separator its default value
    List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(initialSequence);
    List<Decision> defaultDecisions = new ArrayList<Decision>(defaultOutcomes.size());
    for (TokeniserOutcome outcome : defaultOutcomes) {
        Decision tokeniserDecision = new Decision(outcome.name());
        // authorities record which component produced the decision
        tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName());
        tokeniserDecision.addAuthority("_" + "DefaultDecision");
        defaultDecisions.add(tokeniserDecision);
    }
    // For each test pattern, see if anything in the sentence matches it
    if (this.decisionMaker != null) {
        // every match sequence that has at least one token to check
        List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>();
        // first token to check -> the match sequences starting on that token
        Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>();
        // match sequence -> the pattern match located on its first token to check
        Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>();
        // all tokens to check of all match sequences
        Set<Token> matchedTokens = new HashSet<Token>();
        for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) {
            List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(initialSequence);
            for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) {
                if (matchSequence.getTokensToCheck().size() > 0) {
                    matchingSequences.add(matchSequence);
                    matchedTokens.addAll(matchSequence.getTokensToCheck());
                    TokenPatternMatch primaryMatch = null;
                    Token token = matchSequence.getTokensToCheck().get(0);
                    Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token);
                    if (matchSequences == null) {
                        // TreeSet: iteration order follows the natural ordering
                        // of TokenPatternMatchSequence
                        matchSequences = new TreeSet<TokenPatternMatchSequence>();
                        tokenMatchSequenceMap.put(token, matchSequences);
                    }
                    matchSequences.add(matchSequence);
                    // the primary match is the pattern match sitting on the
                    // first token to check
                    for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) {
                        if (patternMatch.getToken().equals(token)) {
                            primaryMatch = patternMatch;
                            break;
                        }
                    }
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Found match: " + primaryMatch);
                    }
                    // NOTE(review): primaryMatch is still null here if no
                    // pattern match was found on the first token to check
                    primaryMatchMap.put(matchSequence, primaryMatch);
                }
            }
        }
        // we want to create the n most likely token sequences
        // the sequence has to correspond to a token pattern
        Map<TokenPatternMatchSequence, List<Decision>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision>>();
        for (TokenPatternMatchSequence matchSequence : matchingSequences) {
            TokenPatternMatch match = primaryMatchMap.get(matchSequence);
            // NOTE(review): match.toString() throws a NullPointerException if
            // a null primary match was stored above; unlike the trace calls,
            // the message is also built even when debug logging is disabled
            LOG.debug("next pattern match: " + match.toString());
            List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>();
            for (TokenPatternMatchFeature<?> feature : features) {
                RuntimeEnvironment env = new RuntimeEnvironment();
                FeatureResult<?> featureResult = feature.check(match, env);
                if (featureResult != null) {
                    tokenFeatureResults.add(featureResult);
                }
            }
            if (LOG.isTraceEnabled()) {
                // log the feature results in sorted order
                SortedSet<String> featureResultSet = tokenFeatureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
                for (String featureResultString : featureResultSet) {
                    LOG.trace(featureResultString);
                }
            }
            List<Decision> decisions = this.decisionMaker.decide(tokenFeatureResults);
            for (ClassificationObserver observer : this.observers) observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions);
            for (Decision decision : decisions) {
                decision.addAuthority("_" + this.getClass().getSimpleName());
                decision.addAuthority("_" + "Patterns");
                decision.addAuthority(match.getPattern().getName());
            }
            matchSequenceDecisionMap.put(matchSequence, decisions);
        }
        // initially create a heap with a single, empty sequence
        PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
        TokenisedAtomicTokenSequence emptySequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
        heap.add(emptySequence);
        for (int i = 0; i < initialSequence.listWithWhiteSpace().size(); i++) {
            Token token = initialSequence.listWithWhiteSpace().get(i);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Token : \"" + token.getAnalyisText() + "\"");
            }
            // build a new heap for this iteration
            PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap;
            heap = new PriorityQueue<TokenisedAtomicTokenSequence>();
            if (i == 0) {
                // first token is always "separate" from the outside world
                Decision decision = new Decision(TokeniserOutcome.SEPARATE.name());
                decision.addAuthority("_" + this.getClass().getSimpleName());
                decision.addAuthority("_" + "DefaultDecision");
                TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(emptySequence);
                newSequence.add(taggedToken);
                heap.add(newSequence);
                continue;
            }
            // limit the heap breadth to K
            int maxSequences = previousHeap.size() > this.getBeamWidth() ? this.getBeamWidth() : previousHeap.size();
            for (int j = 0; j < maxSequences; j++) {
                TokenisedAtomicTokenSequence history = previousHeap.poll();
                // Find the separating & non-separating decisions
                if (history.size() > i) {
                    // token already added as part of a sequence
                    // introduced by another token
                    heap.add(history);
                } else if (tokenMatchSequenceMap.containsKey(token)) {
                    // token begins one or more match sequences
                    // these are ordered from shortest to longest (via
                    // TreeSet)
                    List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>(tokenMatchSequenceMap.get(token));
                    // Since sequences P1..Pn contain each other,
                    // there can be exactly matchSequences.size()
                    // consistent solutions
                    // Assume the default is separate
                    // 0: all separate
                    // 1: join P1, separate rest
                    // 2: join P2, separate rest
                    // ...
                    // n: join Pn
                    // We need to add each of these to the heap
                    // by taking the product of all probabilities
                    // consistent with each solution
                    // The probabilities for each solution are (j=join,
                    // s=separate)
                    // All separate: s1 x s2 x ... x sn
                    // P1: j1 x s2 x ... x sn
                    // P2: j1 x j2 x ... x sn
                    // ...
                    // Pn: j1 x j2 x ... x jn
                    // Any solution of the form s1 x j2 would be
                    // inconsistent, and is not considered
                    // If Pi and Pj start and end on the exact same
                    // token, then the solution for both is
                    // Pi: j1 x ... x ji x jj x sj+1 ... x sn
                    // Pj: j1 x ... x ji x jj x sj+1 ... x sn
                    // Note of course that we're never likely to have
                    // more than two Ps here,
                    // but we need a solution for more, just to be sure
                    // NOTE(review): the default decision is looked up by
                    // token.getIndexWithWhiteSpace() here but by the loop
                    // index i in the final else branch - presumably the two
                    // are equal; confirm
                    TokeniserOutcome defaultOutcome = TokeniserOutcome.valueOf(defaultDecisions.get(token.getIndexWithWhiteSpace()).getOutcome());
                    TokeniserOutcome otherOutcome = null;
                    if (defaultOutcome == TokeniserOutcome.SEPARATE)
                        otherOutcome = TokeniserOutcome.JOIN;
                    else
                        otherOutcome = TokeniserOutcome.SEPARATE;
                    // one slot for the all-default solution plus one per
                    // match sequence, each starting at probability 1
                    double[] decisionProbs = new double[matchSequences.size() + 1];
                    for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] = 1;
                    // Note: k0 = default decision (e.g. separate all),
                    // k1=first pattern
                    // p1 = first pattern
                    int p = 1;
                    int prevEndIndex = -1;
                    for (TokenPatternMatchSequence matchSequence : matchSequences) {
                        // end index of the last token to check in this match
                        // sequence
                        int endIndex = matchSequence.getTokensToCheck().get(matchSequence.getTokensToCheck().size() - 1).getEndIndex();
                        List<Decision> decisions = matchSequenceDecisionMap.get(matchSequence);
                        for (Decision decision : decisions) {
                            for (int k = 0; k < decisionProbs.length; k++) {
                                if (decision.getOutcome().equals(defaultOutcome.name())) {
                                    // e.g. separate in most cases
                                    // default outcome: applies to the
                                    // solutions before p (shifted by one when
                                    // this sequence does not end after the
                                    // previous one)
                                    if (k < p && endIndex > prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                    else if (k + 1 < p && endIndex <= prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                } else {
                                    // e.g. join in most cases
                                    // non-default outcome: applies to the
                                    // solutions from p onwards (same shift)
                                    if (k >= p && endIndex > prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                    else if (k + 1 >= p && endIndex <= prevEndIndex)
                                        decisionProbs[k] *= decision.getProbability();
                                }
                            }
                        // end of loop over k
                        }
                        // end of loop over decisions (only 2 of these)
                        prevEndIndex = endIndex;
                        p++;
                    }
                    // transform to probability distribution
                    // (left untouched when the sum is zero)
                    double sumProbs = 0;
                    for (int k = 0; k < decisionProbs.length; k++) sumProbs += decisionProbs[k];
                    if (sumProbs > 0)
                        for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] /= sumProbs;
                    // Apply default decision
                    // Since this is the default decision for all tokens
                    // in the sequence, we don't add the other tokens
                    // for now,
                    // so as to allow them
                    // to get examined one at a time, just in case one
                    // of them starts its own separate sequence
                    Decision defaultDecision = new Decision(defaultOutcome.name(), decisionProbs[0]);
                    defaultDecision.addAuthority("_" + this.getClass().getSimpleName());
                    defaultDecision.addAuthority("_" + "Patterns");
                    for (TokenPatternMatchSequence matchSequence : matchSequences) {
                        defaultDecision.addAuthority(matchSequence.getTokenPattern().getName());
                    }
                    TaggedToken<TokeniserOutcome> defaultTaggedToken = new TaggedToken<>(token, defaultDecision, TokeniserOutcome.valueOf(defaultDecision.getOutcome()));
                    TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(history);
                    defaultSequence.add(defaultTaggedToken);
                    defaultSequence.addDecision(defaultDecision);
                    heap.add(defaultSequence);
                    // Apply one non-default decision per match sequence
                    for (int k = 0; k < matchSequences.size(); k++) {
                        TokenPatternMatchSequence matchSequence = matchSequences.get(k);
                        // slot 0 holds the default solution, hence k + 1
                        double prob = decisionProbs[k + 1];
                        Decision decision = new Decision(otherOutcome.name(), prob);
                        decision.addAuthority("_" + this.getClass().getSimpleName());
                        decision.addAuthority("_" + "Patterns");
                        decision.addAuthority(matchSequence.getTokenPattern().getName());
                        TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                        TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
                        newSequence.add(taggedToken);
                        newSequence.addDecision(decision);
                        // add all the other tokens in this sequence to the
                        // solution, with the same outcome
                        for (Token tokenInSequence : matchSequence.getTokensToCheck()) {
                            if (tokenInSequence.equals(token)) {
                                continue;
                            }
                            Decision decisionInSequence = new Decision(decision.getOutcome());
                            decisionInSequence.addAuthority("_" + this.getClass().getSimpleName());
                            decisionInSequence.addAuthority("_" + "DecisionInSequence");
                            decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default");
                            decisionInSequence.addAuthority("_" + "Patterns");
                            TaggedToken<TokeniserOutcome> taggedTokenInSequence = new TaggedToken<>(tokenInSequence, decisionInSequence, TokeniserOutcome.valueOf(decisionInSequence.getOutcome()));
                            newSequence.add(taggedTokenInSequence);
                        }
                        heap.add(newSequence);
                    }
                // end of loop over match sequences
                } else {
                    // token doesn't start match sequence, and hasn't
                    // already been added to the current sequence
                    Decision decision = defaultDecisions.get(i);
                    if (matchedTokens.contains(token)) {
                        // the token belongs to some match sequence: use a
                        // fresh decision with the default outcome and
                        // pattern-related authorities
                        decision = new Decision(decision.getOutcome());
                        decision.addAuthority("_" + this.getClass().getSimpleName());
                        decision.addAuthority("_" + "DecisionInSequence");
                        decision.addAuthority("_" + "DecisionInSequence_default");
                        decision.addAuthority("_" + "Patterns");
                    }
                    TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
                    TokenisedAtomicTokenSequence newSequence = new TokenisedAtomicTokenSequence(history);
                    newSequence.add(taggedToken);
                    heap.add(newSequence);
                }
            }
        // end of loop over sequences in the old heap
        }
        // end of loop over tokens
        // collect at most getBeamWidth() sequences, in heap order
        sequences = new ArrayList<TokenisedAtomicTokenSequence>();
        int k = 0;
        while (!heap.isEmpty()) {
            sequences.add(heap.poll());
            k++;
            if (k >= this.getBeamWidth())
                break;
        }
    } else {
        // no decision maker: return a single sequence made of the default
        // decisions
        sequences = new ArrayList<TokenisedAtomicTokenSequence>();
        TokenisedAtomicTokenSequence defaultSequence = new TokenisedAtomicTokenSequence(sentence, 0, this.getSessionId());
        int i = 0;
        for (Token token : initialSequence.listWithWhiteSpace()) {
            Decision decision = defaultDecisions.get(i++);
            TaggedToken<TokeniserOutcome> taggedToken = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
            defaultSequence.add(taggedToken);
        }
        sequences.add(defaultSequence);
    }
    // have decision maker?
    return sequences;
}
Also used : ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) PriorityQueue(java.util.PriorityQueue) LoggerFactory(org.slf4j.LoggerFactory) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) TalismaneSession(com.joliciel.talismane.TalismaneSession) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) TokenPatternMatchFeatureParser(com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeatureParser) TokenPatternMatchFeature(com.joliciel.talismane.tokeniser.features.TokenPatternMatchFeature) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) Tokeniser(com.joliciel.talismane.tokeniser.Tokeniser) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) Token(com.joliciel.talismane.tokeniser.Token) Sentence(com.joliciel.talismane.rawText.Sentence) 
InputStream(java.io.InputStream) SortedSet(java.util.SortedSet) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Set(java.util.Set) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) PriorityQueue(java.util.PriorityQueue) Decision(com.joliciel.talismane.machineLearning.Decision) ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 2 with TokenisedAtomicTokenSequence

use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in project talismane by joliciel-informatique.

the class PatternTokeniser method applyDecision.

/**
 * Copies {@code history} and extends the copy with {@code token} tagged by
 * {@code decision}.
 * <p>
 * The decision itself is recorded on the new sequence only when it is
 * statistical. When a match sequence is supplied, each of its other tokens to
 * check is appended too, tagged with the same decision.
 *
 * @param token the token the decision applies to
 * @param decision the decision used to tag every token added here
 * @param history the sequence to copy and extend; it is not modified by this
 *        method's own calls
 * @param matchSequence the match sequence whose remaining tokens should be
 *        added, or {@code null} to add only {@code token}
 * @param defaultDecision not used by this method
 * @return the newly created, extended sequence
 */
TokenisedAtomicTokenSequence applyDecision(Token token, Decision decision, TokenisedAtomicTokenSequence history, TokenPatternMatchSequence matchSequence, Decision defaultDecision) {
    TaggedToken<TokeniserOutcome> firstTagged = new TaggedToken<>(token, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
    TokenisedAtomicTokenSequence extended = new TokenisedAtomicTokenSequence(history);
    extended.add(firstTagged);
    if (decision.isStatistical()) {
        extended.addDecision(decision);
    }
    // without a match sequence there is nothing more to add
    if (matchSequence == null) {
        return extended;
    }
    for (Token candidate : matchSequence.getTokensToCheck()) {
        // the token itself has already been added above
        if (!candidate.equals(token)) {
            TaggedToken<TokeniserOutcome> tagged = new TaggedToken<>(candidate, decision, TokeniserOutcome.valueOf(decision.getOutcome()));
            extended.add(tagged);
        }
    }
    return extended;
}
Also used : TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 3 with TokenisedAtomicTokenSequence

use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in project talismane by joliciel-informatique.

the class TokenEvaluationCorpusWriter method onNextTokenSequence.

/**
 * Writes one evaluated sentence to the corpus writer.
 * <p>
 * The sentence text goes on the first line. Each real token then gets its own
 * line: the original text, a tab, whether every guessed decision falling
 * inside the token matched the real one, and the sorted authorities behind
 * those decisions (those starting with "_" are left out). A blank line closes
 * the sentence.
 *
 * @param realSequence the reference tokenisation
 * @param guessedAtomicSequences the guessed tokenisations; only the first one
 *        is used
 * @throws IOException if writing to the corpus writer fails
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) throws IOException {
    List<Integer> trueSplits = realSequence.getTokenSplits();
    TokenisedAtomicTokenSequence bestGuess = guessedAtomicSequences.get(0);
    // per start index: the expected outcome, the guessed outcome and the
    // authorities behind the guess
    Map<Integer, TokeniserOutcome> expectedByStart = new HashMap<>();
    Map<Integer, TokeniserOutcome> guessedByStart = new HashMap<>();
    Map<Integer, List<String>> authoritiesByStart = new HashMap<>();
    List<Integer> startIndexes = new ArrayList<>();
    corpusWriter.write(realSequence.getSentence().getText() + "\n");
    for (TaggedToken<TokeniserOutcome> guessed : bestGuess) {
        TokeniserOutcome guessedOutcome = guessed.getTag();
        int start = guessed.getToken().getStartIndex();
        TokeniserOutcome expectedOutcome;
        if (trueSplits.contains(start)) {
            expectedOutcome = TokeniserOutcome.SEPARATE;
        } else {
            expectedOutcome = TokeniserOutcome.JOIN;
        }
        startIndexes.add(start);
        expectedByStart.put(start, expectedOutcome);
        guessedByStart.put(start, guessedOutcome);
        authoritiesByStart.put(start, guessed.getDecision().getAuthorities());
    }
    int previousEnd = 0;
    for (Token realToken : realSequence) {
        corpusWriter.write(realToken.getOriginalText());
        Set<String> sortedAuthorities = new TreeSet<>();
        boolean allCorrect = true;
        int tokenEnd = realToken.getEndIndex();
        for (int start : startIndexes) {
            // only decisions lying between the previous token's end and this
            // token's end are attributed to this token
            if (start < previousEnd || start >= tokenEnd) {
                continue;
            }
            if (allCorrect) {
                allCorrect = expectedByStart.get(start) == guessedByStart.get(start);
            }
            sortedAuthorities.addAll(authoritiesByStart.get(start));
        }
        corpusWriter.write("\t" + allCorrect);
        for (String authority : sortedAuthorities) {
            if (authority.startsWith("_")) {
                continue;
            }
            corpusWriter.write("\t" + authority);
        }
        corpusWriter.write("\n");
        corpusWriter.flush();
        previousEnd = tokenEnd;
    }
    corpusWriter.write("\n");
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaggedToken(com.joliciel.talismane.tokeniser.TaggedToken) Token(com.joliciel.talismane.tokeniser.Token) TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 4 with TokenisedAtomicTokenSequence

use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in project talismane by joliciel-informatique.

the class TokenFScoreCalculator method onNextTokenSequence.

/**
 * Scores the first guessed tokenisation against the real one.
 * <p>
 * Each guessed decision is compared with the real decision at the same start
 * index (SEPARATE if the real sequence splits there, JOIN otherwise). Every
 * comparison increments the global f-score calculator and the calculator of
 * each authority behind the guess. Every mismatch is additionally stored as an
 * error record, with a window of up to {@code NUM_CHARS} characters of
 * context on each side, under each of those authorities.
 *
 * @param realSequence the reference tokenisation
 * @param guessedAtomicSequences the guessed tokenisations; only the first one
 *        is used
 */
@Override
public void onNextTokenSequence(TokenSequence realSequence, List<TokenisedAtomicTokenSequence> guessedAtomicSequences) {
    List<Integer> realSplits = realSequence.getTokenSplits();
    String sentence = realSequence.getSentence().getText().toString();
    TokenisedAtomicTokenSequence tokeniserAtomicTokenSequence = guessedAtomicSequences.get(0);
    TokenSequence guessedSequence = tokeniserAtomicTokenSequence.inferTokenSequence();
    List<Integer> guessedSplits = guessedSequence.getTokenSplits();
    if (LOG.isDebugEnabled()) {
        // render both tokenisations as '|'-separated text for comparison
        int pos = 0;
        StringBuilder sb = new StringBuilder();
        for (int split : realSplits) {
            String aToken = sentence.substring(pos, split);
            sb.append('|');
            sb.append(aToken);
            pos = split;
        }
        int pos2 = 0;
        StringBuilder sb2 = new StringBuilder();
        for (int split : guessedSplits) {
            String aToken = sentence.substring(pos2, split);
            sb2.append('|');
            sb2.append(aToken);
            pos2 = split;
        }
        LOG.debug("Real:    " + sb.toString());
        LOG.debug("Guessed: " + sb2.toString());
    }
    for (TaggedToken<TokeniserOutcome> guessTag : tokeniserAtomicTokenSequence) {
        TokeniserOutcome guessDecision = guessTag.getTag();
        int startIndex = guessTag.getToken().getStartIndex();
        boolean realSplit = realSplits.contains(startIndex);
        TokeniserOutcome realDecision = realSplit ? TokeniserOutcome.SEPARATE : TokeniserOutcome.JOIN;
        if (!realDecision.equals(guessDecision)) {
            // context window around the decision point, clamped to the
            // sentence bounds
            int start1 = Math.max(0, startIndex - NUM_CHARS);
            // substring's end index is exclusive, so the upper clamp is
            // sentence.length(): clamping to length() - 1 dropped the last
            // character and could produce an end index below the start index
            int end1 = Math.min(startIndex + NUM_CHARS, sentence.length());
            String startString = sentence.substring(start1, startIndex);
            startString = StringUtils.padLeft(startString, NUM_CHARS);
            // "-" marks a missed separation, "+" a missed join
            String symbol = "+";
            if (realDecision == TokeniserOutcome.SEPARATE)
                symbol = "-";
            TokeniserErrorRecord errorRecord = new TokeniserErrorRecord();
            errorRecord.realDecision = realDecision;
            errorRecord.guessDecision = guessDecision;
            errorRecord.context = startString + "[" + symbol + "]" + sentence.substring(startIndex, end1);
            LOG.debug("guess " + guessDecision + ", real " + realDecision + ", context: " + errorRecord.context);
            // the same record is filed under every authority behind the guess
            for (String authority : guessTag.getDecision().getAuthorities()) {
                List<TokeniserErrorRecord> errors = errorMap.get(authority);
                if (errors == null) {
                    errors = new ArrayList<TokeniserErrorRecord>();
                    errorMap.put(authority, errors);
                }
                errors.add(errorRecord);
            }
        }
        fScoreCalculator.increment(realDecision, guessDecision);
        for (String authority : guessTag.getDecision().getAuthorities()) {
            FScoreCalculator<TokeniserOutcome> taggerFScoreCalculator = taggerFScoreCalculators.get(authority);
            if (taggerFScoreCalculator == null) {
                taggerFScoreCalculator = new FScoreCalculator<TokeniserOutcome>();
                taggerFScoreCalculators.put(authority, taggerFScoreCalculator);
            }
            taggerFScoreCalculator.increment(realDecision, guessDecision);
        }
    }
}
Also used : TokeniserOutcome(com.joliciel.talismane.tokeniser.TokeniserOutcome) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Example 5 with TokenisedAtomicTokenSequence

use of com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence in project talismane by joliciel-informatique.

the class TokeniserEvaluator method evaluate.

/**
 * Runs the tokeniser over every sentence supplied by the corpus reader and
 * reports the result to the registered observers.
 * <p>
 * For each sentence, every observer receives the real token sequence together
 * with the guessed atomic sequences. Once the corpus is exhausted, every
 * observer is told that the evaluation is complete.
 *
 * @throws TalismaneException
 *             propagated from the corpus reader, the tokeniser or an observer
 * @throws IOException
 *             propagated from the corpus reader, the tokeniser or an observer
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        TokenSequence goldSequence = corpusReader.nextTokenSequence();
        List<TokenisedAtomicTokenSequence> guesses = tokeniser.tokeniseWithDecisions(goldSequence.getSentence());
        for (TokenEvaluationObserver listener : observers) {
            listener.onNextTokenSequence(goldSequence, guesses);
        }
    }
    // corpus exhausted: let each observer finalise its output
    for (TokenEvaluationObserver listener : observers) {
        listener.onEvaluationComplete();
    }
}
Also used : TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence) TokenisedAtomicTokenSequence(com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)

Aggregations

TokenisedAtomicTokenSequence (com.joliciel.talismane.tokeniser.TokenisedAtomicTokenSequence)6 TokeniserOutcome (com.joliciel.talismane.tokeniser.TokeniserOutcome)5 Token (com.joliciel.talismane.tokeniser.Token)4 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)4 Sentence (com.joliciel.talismane.rawText.Sentence)3 TaggedToken (com.joliciel.talismane.tokeniser.TaggedToken)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 TreeSet (java.util.TreeSet)3 TalismaneException (com.joliciel.talismane.TalismaneException)2 Decision (com.joliciel.talismane.machineLearning.Decision)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Set (java.util.Set)2 TalismaneSession (com.joliciel.talismane.TalismaneSession)1 ClassificationModel (com.joliciel.talismane.machineLearning.ClassificationModel)1 ClassificationObserver (com.joliciel.talismane.machineLearning.ClassificationObserver)1 DecisionMaker (com.joliciel.talismane.machineLearning.DecisionMaker)1 ExternalResource (com.joliciel.talismane.machineLearning.ExternalResource)1 MachineLearningModelFactory (com.joliciel.talismane.machineLearning.MachineLearningModelFactory)1