Search in sources :

Example 16 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class HistorySearchFeature method checkInternal.

/**
 * Scans the pos-tagging history for the first token accepted by {@code criterion}.
 * <p>
 * The scan starts at {@code startIndexFeature}'s outcome (default: the index of the
 * current token minus one) and proceeds towards {@code endIndexFeature}'s outcome
 * (default: 0), both ends inclusive. Both bounds are clamped to the valid index range
 * of the history. The scan runs forwards only when the end index is strictly greater
 * than the start index; otherwise it runs backwards.
 * <p>
 * When a {@code stopCriterion} is configured, the scan is abandoned at the first
 * non-matching token that the stop criterion accepts.
 *
 * @param context the pos-tagger context providing the current token and the history
 * @param env the runtime environment handed to every nested feature check
 * @return a result wrapping the first matching token, or {@code null} if an index
 *         feature yields no result or no token matches
 * @throws TalismaneException if thrown by a nested feature check
 */
@Override
public FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) throws TalismaneException {
    int from = context.getToken().getIndex() - 1;
    int to = 0;

    if (startIndexFeature != null) {
        FeatureResult<Integer> fromResult = startIndexFeature.check(context, env);
        if (fromResult == null) {
            return null;
        }
        from = fromResult.getOutcome();
    }
    if (endIndexFeature != null) {
        FeatureResult<Integer> toResult = endIndexFeature.check(context, env);
        if (toResult == null) {
            return null;
        }
        to = toResult.getOutcome();
    }

    // Clamp both bounds into [0, lastIndex]; with an empty history lastIndex is -1,
    // which makes the loop below a no-op.
    final int lastIndex = context.getHistory().size() - 1;
    from = Math.min(Math.max(from, 0), lastIndex);
    to = Math.min(Math.max(to, 0), lastIndex);

    final boolean forward = to > from;
    final int direction = forward ? 1 : -1;

    for (int i = from; forward ? (i <= lastIndex && i <= to) : (i >= 0 && i >= to); i += direction) {
        PosTaggedToken candidate = context.getHistory().get(i);

        FeatureResult<Boolean> accepted = this.criterion.check(candidate, env);
        if (accepted != null && accepted.getOutcome()) {
            return this.generateResult(candidate);
        }

        if (stopCriterion != null) {
            FeatureResult<Boolean> stop = this.stopCriterion.check(candidate, env);
            if (stop != null && stop.getOutcome()) {
                // The stop criterion ends the search without a match.
                return null;
            }
        }
    }
    return null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken)

Example 17 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class LemmaFeature method checkInternal.

/**
 * Returns the lemma of the first lexical entry attached to the addressed pos-tagged token.
 *
 * @param context the context from which the token wrapper is resolved via {@code getToken}
 * @param env the runtime environment passed to {@code getToken}
 * @return a result holding the lemma of the first lexical entry, or {@code null} when
 *         no wrapper is resolved, the wrapper holds no pos-tagged token, or the token
 *         has no lexical entries
 * @throws TalismaneException if thrown while resolving the token
 */
@Override
protected FeatureResult<String> checkInternal(T context, RuntimeEnvironment env) throws TalismaneException {
    PosTaggedTokenWrapper wrapper = this.getToken(context, env);
    PosTaggedToken taggedToken = (wrapper == null) ? null : wrapper.getPosTaggedToken();
    if (taggedToken == null) {
        return null;
    }

    List<LexicalEntry> entries = taggedToken.getLexicalEntries();
    if (entries.isEmpty()) {
        return null;
    }
    // Only the first lexical entry is consulted.
    return this.generateResult(entries.get(0).getLemma());
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) LexicalEntry(com.joliciel.talismane.lexicon.LexicalEntry)

Example 18 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class ParseFeatureTester method onNextParseConfiguration.

/**
 * Replays the transitions of a parse configuration one by one. Before each transition
 * is applied, every parse feature is checked against the current intermediate
 * configuration, and a rendering of the sentence is filed in {@code featureResultMap}
 * under each non-null feature result and the code of the transition about to be applied.
 * <p>
 * In the rendering, the tokens equal to the first element of the stack or of the buffer
 * are wrapped in {@code #[ ... ]#}.
 *
 * @param parseConfiguration the configuration whose transition sequence is replayed
 * @throws TalismaneException if thrown by a feature check or by applying a transition
 */
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration) throws TalismaneException {
    ParseConfiguration currentConfiguration = new ParseConfiguration(parseConfiguration.getPosTagSequence());
    for (Transition transition : parseConfiguration.getTransitions()) {
        // Render the sentence, marking the tokens currently in focus.
        StringBuilder rendering = new StringBuilder();
        for (PosTaggedToken taggedToken : currentConfiguration.getPosTagSequence()) {
            boolean inFocus = taggedToken.equals(currentConfiguration.getStack().getFirst())
                    || taggedToken.equals(currentConfiguration.getBuffer().getFirst());
            String tokenAndTag = taggedToken.getToken().getOriginalText().replace(' ', '_') + "/" + taggedToken.getTag().toString();
            if (inFocus) {
                rendering.append(" #[").append(tokenAndTag).append("]#");
            } else {
                rendering.append(" ").append(tokenAndTag);
            }
        }
        rendering.append(" ## Line: ").append(parseConfiguration.getSentence().getStartLineNumber());
        final String sentence = rendering.toString();
        if (LOG.isTraceEnabled()) {
            LOG.trace(sentence);
        }

        // Collect every feature that produces a result for the current configuration.
        List<FeatureResult<?>> results = new ArrayList<FeatureResult<?>>();
        for (ParseConfigurationFeature<?> parseFeature : parseFeatures) {
            RuntimeEnvironment env = new RuntimeEnvironment();
            FeatureResult<?> result = parseFeature.check(currentConfiguration, env);
            if (result == null) {
                continue;
            }
            results.add(result);
            if (LOG.isTraceEnabled()) {
                LOG.trace(result.toString());
            }
        }

        // File the rendered sentence under (feature result, transition code).
        final String classification = transition.getCode();
        for (FeatureResult<?> result : results) {
            String resultKey = result.toString();
            Map<String, List<String>> byClassification = featureResultMap.get(resultKey);
            if (byClassification == null) {
                byClassification = new TreeMap<String, List<String>>();
                featureResultMap.put(resultKey, byClassification);
            }
            List<String> sentences = byClassification.get(classification);
            if (sentences == null) {
                sentences = new ArrayList<String>();
                byClassification.put(classification, sentences);
            }
            sentences.add(sentence);
        }

        // Copy the configuration and apply the transition to the copy for the next round.
        currentConfiguration = new ParseConfiguration(currentConfiguration);
        transition.apply(currentConfiguration);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) RuntimeEnvironment(com.joliciel.talismane.machineLearning.features.RuntimeEnvironment) ArrayList(java.util.ArrayList) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) Transition(com.joliciel.talismane.parser.Transition) ArrayList(java.util.ArrayList) List(java.util.List) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 19 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class ParseOutputRewriter method getCorpusLines.

/**
 * Converts a parse configuration into corpus lines and applies the split rewrite rules.
 * <p>
 * One corpus line is produced per non-root pos-tagged token. Each line is then tested
 * against every rewrite rule; when all of a rule's conditions match and its action is
 * a {@link SplitAction}, that action is recorded for the line (a later matching rule
 * replaces an earlier one). If at least one split was recorded, a new list is built in
 * which each split line is replaced by one line per entry in the action's
 * {@code elementValues}, and all indexes and governor indexes are renumbered.
 * <p>
 * A conditional action whose {@code relativeIndex} points outside the sentence, or
 * whose tested element is absent on the referenced line, is treated as not matching
 * (so that the group's default action applies) rather than raising a runtime exception.
 *
 * @param parseConfiguration the parse configuration to convert
 * @return the corpus lines, rewritten if any split rule matched
 * @throws TalismaneException if a rewrite rule has a condition on an element other than
 *         POSTAG, TOKEN, LEMMA or LABEL
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
    // first convert the parse configuration to a list of corpus lines
    List<CorpusLine> corpusLines = new ArrayList<>();
    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (!posTaggedToken.isRoot()) {
            DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
            DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
            String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
            CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
            corpusLine.setIndex(posTaggedToken.getIndex());
            corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
            corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
            corpusLine.setPosTag(posTaggedToken.getTag().getCode());
            String morphology = posTaggedToken.getMorphologyForCoNLL();
            corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
            // A token without a governing arc is given governor index 0 and label "_".
            corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
            corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
            corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
            corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
            if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
                corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
            if (posTaggedToken.getToken().getTrailingRawOutput() != null)
                corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
            corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
            corpusLine.setPosTagProbability(posTaggedToken.getProbability());
            if (arc != null)
                corpusLine.setParseProbability(arc.getProbability());
            corpusLines.add(corpusLine);
        }
    }

    // second, find which lines are matched by a split rule
    Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
    for (CorpusLine corpusLine : corpusLines) {
        if (LOG.isDebugEnabled())
            LOG.debug(corpusLine.toString());
        for (RewriteRule rewriteRule : rewriteRules) {
            boolean matches = true;
            conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
                Pattern pattern = rewriteRule.conditions.get(corpusElement);
                if (LOG.isTraceEnabled())
                    LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
                switch(corpusElement) {
                    case POSTAG:
                        if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Match failed for " + corpusLine.getPosTag());
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case TOKEN:
                        if (!pattern.matcher(corpusLine.getToken()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LEMMA:
                        if (!pattern.matcher(corpusLine.getLemma()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LABEL:
                        if (!pattern.matcher(corpusLine.getLabel()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    default:
                        throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
                }
            }
            if (matches) {
                if (rewriteRule.action instanceof SplitAction) {
                    SplitAction splitAction = (SplitAction) rewriteRule.action;
                    splitActions.put(corpusLine, splitAction);
                }
            }
        }
    }

    if (splitActions.size() > 0) {
        List<CorpusLine> newCorpusLines = new ArrayList<>();

        // Map each old 1-based index to its new index: a split line takes up as many
        // slots as it has split elements. Index 0 maps to itself.
        Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
        oldToNewIndexMap.put(0, 0);
        int currentIndex = 1;
        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            oldToNewIndexMap.put(i + 1, currentIndex);
            if (splitActions.containsKey(corpusLine)) {
                SplitAction splitAction = splitActions.get(corpusLine);
                currentIndex += splitAction.elementValues.size();
            } else {
                currentIndex++;
            }
        }

        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
            newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
            newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
            newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));
            if (splitActions.containsKey(corpusLine)) {
                SplitAction splitAction = splitActions.get(corpusLine);
                for (int j = 0; j < splitAction.elementValues.size(); j++) {
                    CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
                    splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
                    Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
                    this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                    // The first matching element in each group will be applied
                    // The default element marks the end of each group, and will be
                    // applied if no other match has applied.
                    List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
                    boolean groupHasMatch = false;
                    for (ConditionalAction conditionalAction : conditionalActions) {
                        if (conditionalAction.isDefault) {
                            if (!groupHasMatch) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                            }
                            // The default action marks the end of each matching group.
                            groupHasMatch = false;
                        } else {
                            // The base line is only needed for non-default actions. If it
                            // lies outside the sentence, the condition cannot match.
                            int baseIndex = i + conditionalAction.relativeIndex;
                            boolean match = baseIndex >= 0 && baseIndex < corpusLines.size();
                            if (match) {
                                CorpusLine baseLine = corpusLines.get(baseIndex);
                                for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
                                    String origValue = baseLine.getElement(corpusElement);
                                    Pattern pattern = conditionalAction.conditions.get(corpusElement);
                                    // An absent element cannot satisfy a pattern.
                                    if (origValue == null || !pattern.matcher(origValue).matches()) {
                                        match = false;
                                        break;
                                    }
                                }
                            }
                            if (match) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                                groupHasMatch = true;
                            }
                        }
                    }
                    newCorpusLines.add(splitCorpusLine);
                }
            } else {
                newCorpusLines.add(newCorpusLine);
            }
        }
        corpusLines = newCorpusLines;
    }
    return corpusLines;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Pattern(java.util.regex.Pattern) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Example 20 with PosTaggedToken

use of com.joliciel.talismane.posTagger.PosTaggedToken in project talismane by joliciel-informatique.

the class TransitionLogWriter method onParseEnd.

/**
 * Logs the transitions of the first guessed configuration, optionally restricted to
 * sentences containing an error on one of the configured {@code errorLabels}.
 * <p>
 * When {@code errorLabels} is null or empty, the first guessed configuration is always
 * passed on to {@link #onNextParseConfiguration(ParseConfiguration)}. Otherwise the
 * reference and guessed dependencies are compared token by token (skipping the first
 * token of the sequence): a token counts as an error when its reference arc carries one
 * of the error labels and the guessed arc is missing, has a different label, or has a
 * different head. If at least one error is found, an annotated rendering of the
 * reference and guessed sentences is written to {@code writer} before the configuration
 * is passed on.
 *
 * @param refConfiguration the reference parse configuration
 * @param guessedConfigurations the guessed configurations, of which only the first is used
 * @throws TalismaneException if thrown while logging the guessed configuration
 * @throws IOException if writing to {@code writer} fails
 */
@Override
public void onParseEnd(ParseConfiguration refConfiguration, List<ParseConfiguration> guessedConfigurations) throws TalismaneException, IOException {
    boolean includeMe = true;
    if (errorLabels != null && errorLabels.size() > 0) {
        includeMe = false;
        int i = 0;
        ParseConfiguration guessConfiguration = guessedConfigurations.get(0);
        Set<PosTaggedToken> refTokensToExplain = new HashSet<PosTaggedToken>();
        Set<PosTaggedToken> guessTokensToExplain = new HashSet<PosTaggedToken>();
        Set<PosTaggedToken> refTokensToHighlight = new HashSet<PosTaggedToken>();
        Set<PosTaggedToken> guessTokensToHighlight = new HashSet<PosTaggedToken>();
        for (PosTaggedToken refToken : refConfiguration.getPosTagSequence()) {
            // The first token of the sequence is never examined.
            if (i != 0) {
                DependencyArc refArc = refConfiguration.getGoverningDependency(refToken);
                if (refArc != null) {
                    PosTaggedToken guessToken = guessConfiguration.getPosTagSequence().get(i);
                    if (errorLabels.contains(refArc.getLabel())) {
                        DependencyArc guessArc = guessConfiguration.getGoverningDependency(guessToken);
                        if (guessArc == null || !refArc.getLabel().equals(guessArc.getLabel()) || !this.haveSameHead(refArc, guessArc)) {
                            refTokensToExplain.add(refToken);
                            if (refArc.getHead() != null)
                                refTokensToHighlight.add(refArc.getHead());
                            guessTokensToExplain.add(guessToken);
                            if (guessArc != null && guessArc.getHead() != null)
                                guessTokensToHighlight.add(guessArc.getHead());
                            includeMe = true;
                        }
                    }
                }
            }
            i++;
        }
        if (includeMe) {
            // Build both renderings before writing anything, so that a failure while
            // rendering leaves the output untouched.
            String refLine = this.renderAnnotatedSentence(refConfiguration, refTokensToExplain, refTokensToHighlight);
            String guessLine = this.renderAnnotatedSentence(guessConfiguration, guessTokensToExplain, guessTokensToHighlight);
            writer.write("\n");
            writer.write(refLine + "\n");
            writer.write(guessLine + "\n");
        }
    }
    if (includeMe)
        this.onNextParseConfiguration(guessedConfigurations.get(0));
}

/**
 * Null-safe comparison of the heads of two arcs: two absent heads are considered the
 * same, an absent and a present head differ, and two present heads are compared by index.
 */
private boolean haveSameHead(DependencyArc refArc, DependencyArc guessArc) {
    PosTaggedToken refHead = refArc.getHead();
    PosTaggedToken guessHead = guessArc.getHead();
    if (refHead == null || guessHead == null)
        return refHead == null && guessHead == null;
    return refHead.getIndex() == guessHead.getIndex();
}

/**
 * Renders a sentence as space-terminated {@code text|tag|index} items. Tokens in
 * {@code tokensToExplain} are wrapped in '#' and extended with their governor index and
 * label; tokens in {@code tokensToHighlight} are wrapped in '#' only.
 */
private String renderAnnotatedSentence(ParseConfiguration configuration, Set<PosTaggedToken> tokensToExplain, Set<PosTaggedToken> tokensToHighlight) {
    StringBuilder builder = new StringBuilder();
    for (PosTaggedToken token : configuration.getPosTagSequence()) {
        String base = token.getToken().getOriginalText().replace(' ', '_') + "|" + token.getTag().getCode() + "|" + token.getIndex();
        if (tokensToExplain.contains(token)) {
            DependencyArc arc = configuration.getGoverningDependency(token);
            if (arc == null)
                builder.append("#" + base + "|Gov0|null# ");
            else
                builder.append("#" + base + "|Gov" + (arc.getHead() == null ? 0 : arc.getHead().getIndex()) + "|" + arc.getLabel() + "# ");
        } else if (tokensToHighlight.contains(token)) {
            builder.append("#" + base + "# ");
        } else {
            builder.append(base + " ");
        }
    }
    return builder.toString();
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) DependencyArc(com.joliciel.talismane.parser.DependencyArc) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) HashSet(java.util.HashSet)

Aggregations

PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)77 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)24 PosTaggedTokenWrapper (com.joliciel.talismane.posTagger.features.PosTaggedTokenWrapper)20 PosTagSequence (com.joliciel.talismane.posTagger.PosTagSequence)14 Token (com.joliciel.talismane.tokeniser.Token)11 DependencyArc (com.joliciel.talismane.parser.DependencyArc)9 TalismaneException (com.joliciel.talismane.TalismaneException)8 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 Sentence (com.joliciel.talismane.rawText.Sentence)8 TokenSequence (com.joliciel.talismane.tokeniser.TokenSequence)8 HashMap (java.util.HashMap)7 List (java.util.List)7 TalismaneTest (com.joliciel.talismane.TalismaneTest)6 PosTaggerContext (com.joliciel.talismane.posTagger.PosTaggerContext)6 PosTaggerContextImpl (com.joliciel.talismane.posTagger.PosTaggerContextImpl)6 Config (com.typesafe.config.Config)6 ArrayList (java.util.ArrayList)6 Test (org.junit.Test)6 StringLiteralFeature (com.joliciel.talismane.machineLearning.features.StringLiteralFeature)5