Search in sources :

Example 16 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class ParserEvaluator method evaluate.

/**
 * Walks through every sentence supplied by the corpus reader, builds the
 * pos-tag sequences to parse (from the tokeniser and pos-tagger when they are
 * configured, otherwise from the reference configuration), parses them, and
 * notifies every observer before and after each parse. Once the corpus is
 * exhausted, every observer is told that the evaluation is complete.
 *
 * @throws TalismaneException
 *           if an attempt is made to evaluate with a tokeniser but no
 *           pos-tagger
 * @throws IOException
 */
public void evaluate() throws TalismaneException, IOException {
    while (corpusReader.hasNextSentence()) {
        ParseConfiguration reference = corpusReader.nextConfiguration();

        // Step 1: token sequences, either re-tokenised or taken from the reference.
        List<TokenSequence> tokenSequences;
        if (tokeniser == null) {
            // no tokeniser: reuse the reference tokens, minus the artificial root
            PosTagSequence rootless = reference.getPosTagSequence().clonePosTagSequence();
            rootless.removeRoot();
            tokenSequences = new ArrayList<TokenSequence>();
            tokenSequences.add(rootless.getTokenSequence());
        } else {
            if (posTagger == null) {
                throw new TalismaneException("Cannot evaluate with tokeniser but no pos-tagger");
            }
            Sentence sentence = reference.getPosTagSequence().getTokenSequence().getSentence();
            // annotate the sentence for pre token filters
            for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
                sentenceAnnotator.annotate(sentence);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("TokenFilter: " + sentenceAnnotator);
                    LOG.trace("annotations: " + sentence.getAnnotations());
                }
            }
            tokenSequences = tokeniser.tokenise(sentence);
        }

        // Step 2: pos-tag sequences, either re-tagged or taken from the reference.
        List<PosTagSequence> posTagSequences;
        if (posTagger == null) {
            PosTagSequence referenceSequence = reference.getPosTagSequence();
            posTagSequences = new ArrayList<PosTagSequence>();
            posTagSequences.add(referenceSequence);
        } else if (posTagger instanceof NonDeterministicPosTagger) {
            posTagSequences = ((NonDeterministicPosTagger) posTagger).tagSentence(tokenSequences);
        } else {
            // deterministic tagger: only the first token sequence is tagged
            posTagSequences = new ArrayList<PosTagSequence>();
            posTagSequences.add(posTagger.tagSentence(tokenSequences.get(0)));
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseStart(reference, posTagSequences);
        }

        // Step 3: parse, keeping either the full beam or the single best guess.
        List<ParseConfiguration> guessedConfigurations;
        if (parser instanceof NonDeterministicParser) {
            guessedConfigurations = ((NonDeterministicParser) parser).parseSentence(posTagSequences);
        } else {
            ParseConfiguration singleGuess = parser.parseSentence(posTagSequences.get(0));
            guessedConfigurations = new ArrayList<ParseConfiguration>();
            guessedConfigurations.add(singleGuess);
        }

        for (ParseEvaluationObserver listener : this.observers) {
            listener.onParseEnd(reference, guessedConfigurations);
        }
    }

    for (ParseEvaluationObserver listener : this.observers) {
        listener.onEvaluationComplete();
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) NonDeterministicParser(com.joliciel.talismane.parser.NonDeterministicParser) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) NonDeterministicPosTagger(com.joliciel.talismane.posTagger.NonDeterministicPosTagger) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) Sentence(com.joliciel.talismane.rawText.Sentence)

Example 17 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class ParserFScoreCalculator method onParseEnd.

/**
 * Compares the governing dependency of every non-root token in the reference
 * configuration with the one found in the first guessed configuration, and
 * records the (real, guessed) outcome pair in the f-score calculator.
 * <p>
 * Tokens are matched between the two configurations by start index; a real
 * token is never matched to a guessed token when exactly one of them is
 * empty. A real token with no match is logged, counted as mismatched, and
 * scored as if the guess had no head.
 *
 * @param realConfiguration
 *          the reference parse
 * @param guessedConfigurations
 *          the guessed parses; only the first one is evaluated
 * @throws TalismaneException
 *           if more than half of the tokens in the reference sequence could
 *           not be matched to a guessed token
 */
@Override
public void onParseEnd(ParseConfiguration realConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException {
    PosTagSequence posTagSequence = realConfiguration.getPosTagSequence();
    ParseConfiguration bestGuess = guessedConfigurations.get(0);
    int mismatchedTokens = 0;
    for (PosTaggedToken posTaggedToken : posTagSequence) {
        // the artificial root has no governor to evaluate
        if (!posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) {
            DependencyArc realArc = realConfiguration.getGoverningDependency(posTaggedToken, projective);
            DependencyArc guessedArc = null;
            boolean foundToken = false;
            // look for the guessed token starting at the same position
            for (PosTaggedToken guessedToken : bestGuess.getPosTagSequence()) {
                if (guessedToken.getToken().getStartIndex() == posTaggedToken.getToken().getStartIndex()) {
                    // an empty token can only be matched with another empty token
                    if (guessedToken.getToken().isEmpty() && !posTaggedToken.getToken().isEmpty())
                        continue;
                    if (!guessedToken.getToken().isEmpty() && posTaggedToken.getToken().isEmpty())
                        continue;
                    foundToken = true;
                    guessedArc = bestGuess.getGoverningDependency(guessedToken, projective);
                    break;
                }
            }
            if (!foundToken) {
                LOG.info("Mismatched token :" + posTaggedToken.getToken().getOriginalText() + ", index " + posTaggedToken.getToken().getIndex());
                mismatchedTokens += 1;
            }
            String realLabel = realArc == null ? "noHead" : labeledEvaluation ? realArc.getLabel() : "head";
            String guessedLabel = guessedArc == null ? "noHead" : labeledEvaluation ? guessedArc.getLabel() : "head";
            if (realLabel == null || realLabel.length() == 0)
                realLabel = "noLabel";
            if (guessedLabel == null || guessedLabel.length() == 0)
                guessedLabel = "noLabel";
            // an unlabeled attachment to the root should be considered a
            // "no head" rather than a "no label"
            if (realArc != null && realArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && realLabel.equals("noLabel"))
                realLabel = "noHead";
            if (guessedArc != null && guessedArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && guessedLabel.equals("noLabel"))
                guessedLabel = "noHead";
            if (realArc == null || guessedArc == null) {
                // at most one of the arcs exists: the labels computed above say it all
                fscoreCalculator.increment(realLabel, guessedLabel);
            } else {
                boolean sameHead = realArc.getHead().getToken().getStartIndex() == guessedArc.getHead().getToken().getStartIndex();
                if (sameHead) {
                    fscoreCalculator.increment(realLabel, guessedLabel);
                } else if (guessedLabel.equals("noHead")) {
                    fscoreCalculator.increment(realLabel, "noHead");
                } else if (java.util.Objects.equals(realArc.getLabel(), guessedArc.getLabel())) {
                    // null-safe comparison: an arc's label may be null (see the
                    // "noLabel" normalisation above), which previously caused a
                    // NullPointerException here
                    fscoreCalculator.increment(realLabel, "wrongHead");
                } else {
                    fscoreCalculator.increment(realLabel, "wrongHeadWrongLabel");
                }
            }
        }
    }
    if ((double) mismatchedTokens / (double) posTagSequence.size() > 0.5) {
        // more than half of the tokens mismatched?
        throw new TalismaneException("Too many mismatched tokens in sentence: " + posTagSequence.getTokenSequence().getSentence().getText());
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) TalismaneException(com.joliciel.talismane.TalismaneException) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration)

Example 18 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class PosTaggers method getPosTagger.

/**
 * Returns a clone of the pos-tagger registered for the given session,
 * instantiating and caching it first if none is cached yet.
 * <p>
 * The implementation class name is read from the configuration key
 * {@code talismane.core.<sessionId>.pos-tagger.pos-tagger}; the class must
 * implement {@link PosTagger} and expose a public constructor taking a
 * single {@code String} (the session id).
 *
 * @param sessionId
 *          the session whose pos-tagger is requested
 * @return a clone of the cached pos-tagger
 * @throws ReflectiveOperationException
 *           if the configured class cannot be loaded or instantiated
 */
public static PosTagger getPosTagger(String sessionId) throws ReflectiveOperationException {
    PosTagger cached = posTaggerMap.get(sessionId);
    if (cached != null) {
        return cached.clonePosTagger();
    }

    Config posTaggerConfig = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".pos-tagger");
    String className = posTaggerConfig.getString("pos-tagger");

    @SuppressWarnings("rawtypes") Class untypedClass = Class.forName(className);
    if (!PosTagger.class.isAssignableFrom(untypedClass)) {
        throw new TalismaneException("Class " + className + " does not implement interface " + PosTagger.class.getSimpleName());
    }
    @SuppressWarnings("unchecked") Class<? extends PosTagger> taggerClass = untypedClass;

    Constructor<? extends PosTagger> sessionConstructor;
    try {
        sessionConstructor = taggerClass.getConstructor(String.class);
    } catch (NoSuchMethodException e) {
        // the only supported signature is (String sessionId)
        throw new TalismaneException("No constructor found with correct signature for: " + className);
    }

    PosTagger created = sessionConstructor.newInstance(sessionId);
    posTaggerMap.put(sessionId, created);
    return created.clonePosTagger();
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) Config(com.typesafe.config.Config)

Example 19 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class ForwardStatisticalPosTagger method tagSentence.

/**
 * Pos-tags the supplied token sequences with a beam search and returns up to
 * {@code getBeamWidth()} pos-tag sequences, in the order in which the final
 * priority queue yields them (presumably best score first; this depends on
 * the ordering of {@code PosTagSequence}, which is not visible here).
 * <p>
 * Partial sequences are kept in heaps stored in a {@code TreeMap} keyed by
 * the end index of the last token added. The heap with the lowest key is
 * always processed next, and the search stops once the polled key equals the
 * length of the sentence text. For each history, the decisions for the next
 * token come from, in order of precedence: a pos-tag attribute on the token,
 * the first positive rule whose condition holds, or the decision maker
 * applied to the feature results (then filtered by the negative rules and by
 * {@code MIN_PROB_TO_STORE}).
 *
 * @param input
 *          the token sequences to tag; only the first one is used unless
 *          {@code propagateTokeniserBeam} is set
 * @return a list of cloned pos-tag sequences taken from the final heap
 * @throws TalismaneException
 *           declared by this method; not thrown directly by the code visible
 *           here
 * @throws IOException
 *           declared by this method; not thrown directly by the code visible
 *           here
 */
@Override
public List<PosTagSequence> tagSentence(List<TokenSequence> input) throws TalismaneException, IOException {
    List<TokenSequence> tokenSequences = null;
    if (this.propagateTokeniserBeam) {
        tokenSequences = input;
    } else {
        // ignore all but the first token sequence
        tokenSequences = new ArrayList<>(1);
        tokenSequences.add(input.get(0));
    }
    // length of the sentence text: the heap key which marks the end of the search
    int sentenceLength = tokenSequences.get(0).getSentence().getText().length();
    // heaps of partial sequences, keyed by the heap index computed further below
    TreeMap<Double, PriorityQueue<PosTagSequence>> heaps = new TreeMap<Double, PriorityQueue<PosTagSequence>>();
    PriorityQueue<PosTagSequence> heap0 = new PriorityQueue<PosTagSequence>();
    for (TokenSequence tokenSequence : tokenSequences) {
        // add an empty PosTagSequence for each token sequence
        PosTagSequence emptySequence = new PosTagSequence(tokenSequence);
        emptySequence.setScoringStrategy(decisionMaker.getDefaultScoringStrategy());
        heap0.add(emptySequence);
    }
    heaps.put(0.0, heap0);
    // NOTE(review): finalHeap stays null if no heap whose key equals
    // sentenceLength is ever polled; the result-building loop after the
    // search would then throw a NullPointerException.
    PriorityQueue<PosTagSequence> finalHeap = null;
    while (heaps.size() > 0) {
        // always continue from the heap with the lowest key
        Entry<Double, PriorityQueue<PosTagSequence>> heapEntry = heaps.pollFirstEntry();
        if (LOG.isTraceEnabled()) {
            LOG.trace("heap key: " + heapEntry.getKey() + ", sentence length: " + sentenceLength);
        }
        if (heapEntry.getKey() == sentenceLength) {
            // reached the end of the sentence: this heap holds the complete sequences
            finalHeap = heapEntry.getValue();
            break;
        }
        PriorityQueue<PosTagSequence> previousHeap = heapEntry.getValue();
        // limit the breadth to K
        int maxSequences = previousHeap.size() > this.beamWidth ? this.beamWidth : previousHeap.size();
        for (int j = 0; j < maxSequences; j++) {
            PosTagSequence history = previousHeap.poll();
            Token token = history.getNextToken();
            if (LOG.isTraceEnabled()) {
                LOG.trace("#### Next history ( " + heapEntry.getKey() + "): " + history.toString());
                LOG.trace("Prob: " + df.format(history.getScore()));
                LOG.trace("Token: " + token.getText());
                // print the token sequence with the current token in square brackets
                StringBuilder sb = new StringBuilder();
                for (Token oneToken : history.getTokenSequence().listWithWhiteSpace()) {
                    if (oneToken.equals(token))
                        sb.append("[" + oneToken + "]");
                    else
                        sb.append(oneToken);
                }
                LOG.trace(sb.toString());
            }
            PosTaggerContext context = new PosTaggerContextImpl(token, history);
            List<Decision> decisions = new ArrayList<Decision>();
            boolean ruleApplied = false;
            // does the token carry an attribute which already assigns its pos-tag?
            if (token.getAttributes().containsKey(PosTagger.POS_TAG_ATTRIBUTE)) {
                StringAttribute posTagCodeAttribute = (StringAttribute) token.getAttributes().get(PosTagger.POS_TAG_ATTRIBUTE);
                String posTagCode = posTagCodeAttribute.getValue();
                Decision positiveRuleDecision = new Decision(posTagCode);
                decisions.add(positiveRuleDecision);
                positiveRuleDecision.addAuthority("tokenAttribute");
                ruleApplied = true;
                if (LOG.isTraceEnabled()) {
                    LOG.trace("Token has attribute \"" + PosTagger.POS_TAG_ATTRIBUTE + "\". Setting posTag to: " + posTagCode);
                }
            }
            // test the positive rules on the current token
            if (!ruleApplied) {
                if (posTaggerPositiveRules != null) {
                    for (PosTaggerRule rule : posTaggerPositiveRules) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Checking rule: " + rule.getCondition().getName());
                        }
                        RuntimeEnvironment env = new RuntimeEnvironment();
                        FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
                        if (ruleResult != null && ruleResult.getOutcome()) {
                            Decision positiveRuleDecision = new Decision(rule.getTag().getCode());
                            decisions.add(positiveRuleDecision);
                            positiveRuleDecision.addAuthority(rule.getCondition().getName());
                            ruleApplied = true;
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Rule applies. Setting posTag to: " + rule.getTag().getCode());
                            }
                            // the first positive rule that applies wins
                            break;
                        }
                    }
                }
            }
            if (!ruleApplied) {
                // test the features on the current token
                List<FeatureResult<?>> featureResults = new ArrayList<FeatureResult<?>>();
                for (PosTaggerFeature<?> posTaggerFeature : posTaggerFeatures) {
                    RuntimeEnvironment env = new RuntimeEnvironment();
                    FeatureResult<?> featureResult = posTaggerFeature.check(context, env);
                    if (featureResult != null)
                        featureResults.add(featureResult);
                }
                if (LOG.isTraceEnabled()) {
                    // log the feature results in sorted order
                    SortedSet<String> featureResultSet = featureResults.stream().map(f -> f.toString()).collect(Collectors.toCollection(() -> new TreeSet<String>()));
                    for (String featureResultString : featureResultSet) {
                        LOG.trace(featureResultString);
                    }
                }
                // evaluate the feature results using the maxent model
                decisions = this.decisionMaker.decide(featureResults);
                for (ClassificationObserver observer : this.observers) {
                    observer.onAnalyse(token, featureResults, decisions);
                }
                // apply the negative rules
                Set<String> eliminatedPosTags = new TreeSet<String>();
                if (posTaggerNegativeRules != null) {
                    for (PosTaggerRule rule : posTaggerNegativeRules) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Checking negative rule: " + rule.getCondition().getName());
                        }
                        RuntimeEnvironment env = new RuntimeEnvironment();
                        FeatureResult<Boolean> ruleResult = rule.getCondition().check(context, env);
                        if (ruleResult != null && ruleResult.getOutcome()) {
                            eliminatedPosTags.add(rule.getTag().getCode());
                            if (LOG.isTraceEnabled()) {
                                LOG.trace("Rule applies. Eliminating posTag: " + rule.getTag().getCode());
                            }
                        }
                    }
                    if (eliminatedPosTags.size() > 0) {
                        // keep only the decisions whose outcome was not eliminated
                        List<Decision> decisionShortList = new ArrayList<Decision>();
                        for (Decision decision : decisions) {
                            if (!eliminatedPosTags.contains(decision.getOutcome())) {
                                decisionShortList.add(decision);
                            } else {
                                LOG.trace("Eliminating decision: " + decision.toString());
                            }
                        }
                        if (decisionShortList.size() > 0) {
                            decisions = decisionShortList;
                        } else {
                            // nothing survived: leave the decisions untouched
                            LOG.debug("All decisions eliminated! Restoring original decisions.");
                        }
                    }
                }
                // is this a known word in the lexicon?
                if (LOG.isTraceEnabled()) {
                    String posTags = "";
                    for (PosTag onePosTag : token.getPossiblePosTags()) {
                        posTags += onePosTag.getCode() + ",";
                    }
                    LOG.trace("Token: " + token.getText() + ". PosTags: " + posTags);
                }
                // drop decisions below the minimum probability, unless that
                // would leave no decision at all
                List<Decision> decisionShortList = new ArrayList<Decision>();
                for (Decision decision : decisions) {
                    if (decision.getProbability() >= MIN_PROB_TO_STORE) {
                        decisionShortList.add(decision);
                    }
                }
                if (decisionShortList.size() > 0) {
                    decisions = decisionShortList;
                }
            }
            // extend the history with one new sequence per retained decision
            for (Decision decision : decisions) {
                if (LOG.isTraceEnabled())
                    LOG.trace("Outcome: " + decision.getOutcome() + ", " + decision.getProbability());
                PosTaggedToken posTaggedToken = new PosTaggedToken(token, decision, this.sessionId);
                PosTagSequence sequence = new PosTagSequence(history);
                sequence.addPosTaggedToken(posTaggedToken);
                if (decision.isStatistical())
                    sequence.addDecision(decision);
                double heapIndex = token.getEndIndex();
                // a token whose start index equals its end index gets a 0.5
                // offset, so that its heap is distinct from the heap of
                // regular tokens ending at the same index
                if (token.getStartIndex() == token.getEndIndex())
                    heapIndex += 0.5;
                // if it's the last token, make sure we end
                if (token.getIndex() == sequence.getTokenSequence().size() - 1)
                    heapIndex = sentenceLength;
                if (LOG.isTraceEnabled())
                    LOG.trace("Heap index: " + heapIndex);
                PriorityQueue<PosTagSequence> heap = heaps.get(heapIndex);
                if (heap == null) {
                    heap = new PriorityQueue<PosTagSequence>();
                    heaps.put(heapIndex, heap);
                }
                heap.add(sequence);
            }
            // next outcome for this token
        }
        // next history
    }
    // next atomic index
    // return the best sequences on the heap, up to the beam width
    List<PosTagSequence> sequences = new ArrayList<PosTagSequence>();
    int i = 0;
    while (!finalHeap.isEmpty()) {
        // clone the pos tag sequences to ensure they don't share any underlying
        // data (e.g. token sequences)
        sequences.add(finalHeap.poll().clonePosTagSequence());
        i++;
        if (i >= this.getBeamWidth())
            break;
    }
    // log the final sequences
    if (LOG.isDebugEnabled()) {
        LOG.debug("####Final postag sequences:");
        int j = 1;
        for (PosTagSequence sequence : sequences) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Sequence " + (j++) + ", score=" + df.format(sequence.getScore()));
                LOG.debug("Sequence: " + sequence);
            }
        }
    }
    return sequences;
}
Also used : ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) ZipInputStream(java.util.zip.ZipInputStream) SortedSet(java.util.SortedSet) PriorityQueue(java.util.PriorityQueue) LoggerFactory(org.slf4j.LoggerFactory) Scanner(java.util.Scanner) HashMap(java.util.HashMap) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) MachineLearningModelFactory(com.joliciel.talismane.machineLearning.MachineLearningModelFactory) TreeSet(java.util.TreeSet) TalismaneException(com.joliciel.talismane.TalismaneException) TalismaneSession(com.joliciel.talismane.TalismaneSession) ArrayList(java.util.ArrayList) ClassificationModel(com.joliciel.talismane.machineLearning.ClassificationModel) PosTaggerRule(com.joliciel.talismane.posTagger.features.PosTaggerRule) HashSet(java.util.HashSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) PosTaggerFeature(com.joliciel.talismane.posTagger.features.PosTaggerFeature) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult) Map(java.util.Map) ConfigUtils(com.joliciel.talismane.utils.ConfigUtils) ConfigFactory(com.typesafe.config.ConfigFactory) ArrayListNoNulls(com.joliciel.talismane.utils.ArrayListNoNulls) ExternalResource(com.joliciel.talismane.machineLearning.ExternalResource) DecisionMaker(com.joliciel.talismane.machineLearning.DecisionMaker) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) Logger(org.slf4j.Logger) Config(com.typesafe.config.Config) Collection(java.util.Collection) DecimalFormat(java.text.DecimalFormat) Set(java.util.Set) IOException(java.io.IOException) Decision(com.joliciel.talismane.machineLearning.Decision) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) TreeMap(java.util.TreeMap) PosTaggerFeatureParser(com.joliciel.talismane.posTagger.features.PosTaggerFeatureParser) Token(com.joliciel.talismane.tokeniser.Token) Entry(java.util.Map.Entry) InputStream(java.io.InputStream) 
ArrayList(java.util.ArrayList) StringAttribute(com.joliciel.talismane.tokeniser.StringAttribute) Token(com.joliciel.talismane.tokeniser.Token) PosTaggerRule(com.joliciel.talismane.posTagger.features.PosTaggerRule) TreeSet(java.util.TreeSet) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) PriorityQueue(java.util.PriorityQueue) TreeMap(java.util.TreeMap) Decision(com.joliciel.talismane.machineLearning.Decision) ClassificationObserver(com.joliciel.talismane.machineLearning.ClassificationObserver) TokenSequence(com.joliciel.talismane.tokeniser.TokenSequence) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 20 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class PosTagSequence method removeRoot.

/**
 * Remove a previously pre-pended root.
 * <p>
 * Does nothing when the sequence is empty or when its first element is not
 * tagged with {@code PosTag.ROOT_POS_TAG}. Otherwise the root's token is
 * removed from the underlying token sequence, the first element of this
 * sequence is dropped, and the token sequence is flagged as root-less and
 * re-indexed.
 */
public void removeRoot() {
    if (this.size() <= 0)
        return;

    PosTaggedToken firstElement = this.get(0);
    if (!firstElement.getTag().equals(PosTag.ROOT_POS_TAG))
        return;

    Token emptyToken = firstElement.getToken();
    try {
        tokenSequence.removeEmptyToken(emptyToken);
    } catch (TalismaneException e) {
        // should never happen
        LOG.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }

    this.remove(0);
    tokenSequence.setWithRoot(false);
    tokenSequence.reindex();
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) Token(com.joliciel.talismane.tokeniser.Token)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException)47 ArrayList (java.util.ArrayList)27 Config (com.typesafe.config.Config)14 File (java.io.File)11 List (java.util.List)10 TreeSet (java.util.TreeSet)10 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)9 IOException (java.io.IOException)9 HashMap (java.util.HashMap)9 Set (java.util.Set)9 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 Token (com.joliciel.talismane.tokeniser.Token)8 Map (java.util.Map)8 SortedSet (java.util.SortedSet)8 Collectors (java.util.stream.Collectors)8 Logger (org.slf4j.Logger)8 LoggerFactory (org.slf4j.LoggerFactory)8 Sentence (com.joliciel.talismane.rawText.Sentence)7