Search in sources :

Example 1 with CorpusLine

use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.

the class ParserRegexBasedCorpusReader method processSentence.

/**
 * Builds the parse configuration for one sentence from its corpus lines.
 * <p>
 * Once the superclass has processed the sentence, a root token is prepended to the pos-tag
 * sequence and registered under index 0. One governing arc is created per corpus line (the head
 * index defaults to 0 when the line has no GOVERNOR element). The arcs are then either passed to
 * the transition system to predict transitions, or added directly to the configuration. When the
 * corpus line reader has a NON_PROJ_GOVERNOR placeholder, a second set of arcs is built from the
 * non-projective governor and label and added as manual non-projective dependencies.
 * <p>
 * If the transition system has more than one dependency label, any non-empty label (projective or
 * non-projective) that is not among them raises an UnknownDependencyLabelException.
 *
 * @param sentence the sentence being processed
 * @param corpusLines the corpus lines read for this sentence
 * @throws TalismaneException rethrown after the current sentence has been cleared
 * @throws IOException propagated from the superclass or helpers without clearing the sentence
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
    try {
        super.processSentence(sentence, corpusLines);

        // Register the prepended root under index 0, the default head index.
        PosTaggedToken root = posTagSequence.prependRoot();
        idTokenMap.put(0, root);

        TransitionSystem transitionSystem = TalismaneSession.get(sessionId).getTransitionSystem();
        Set<DependencyArc> governingArcs = new TreeSet<>();
        for (CorpusLine line : corpusLines) {
            int governorIndex = line.hasElement(CorpusElement.GOVERNOR)
                    ? Integer.parseInt(line.getElement(CorpusElement.GOVERNOR))
                    : 0;
            PosTaggedToken governor = idTokenMap.get(governorIndex);
            PosTaggedToken dependent = idTokenMap.get(line.getIndex());
            String label = line.getElement(CorpusElement.LABEL);

            // Labels are only validated when the transition system declares more than one.
            if (transitionSystem.getDependencyLabels().size() > 1) {
                boolean labelUnknown = label.length() > 0 && !transitionSystem.getDependencyLabels().contains(label);
                if (labelUnknown) {
                    throw new UnknownDependencyLabelException((this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath()), line.getLineNumber(), label);
                }

                String nonProjLabel = line.getElement(CorpusElement.NON_PROJ_LABEL);
                boolean nonProjLabelUnknown = nonProjLabel != null && nonProjLabel.length() > 0
                        && !transitionSystem.getDependencyLabels().contains(nonProjLabel);
                if (nonProjLabelUnknown) {
                    throw new UnknownDependencyLabelException((this.getCurrentFile() == null ? "" : this.getCurrentFile().getPath()), line.getLineNumber(), nonProjLabel);
                }
            }

            DependencyArc governingArc = new DependencyArc(governor, dependent, label);
            if (LOG.isTraceEnabled()) {
                LOG.trace(governingArc.toString());
            }
            governingArcs.add(governingArc);
            if (line.hasElement(CorpusElement.DEP_COMMENT)) {
                governingArc.setComment(line.getElement(CorpusElement.DEP_COMMENT));
            }
        }

        configuration = new ParseConfiguration(posTagSequence);
        if (!this.predictTransitions) {
            for (DependencyArc governingArc : governingArcs) {
                configuration.addDependency(governingArc.getHead(), governingArc.getDependent(), governingArc.getLabel(), null);
            }
        } else {
            transitionSystem.predictTransitions(configuration, governingArcs);
        }

        // Non-projective governors are only read when the line reader has a placeholder for them.
        if (!this.getCorpusLineReader().hasPlaceholder(CorpusElement.NON_PROJ_GOVERNOR)) {
            return;
        }

        Set<DependencyArc> nonProjectiveArcs = new TreeSet<>();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Non projective dependencies: ");
        }
        for (CorpusLine line : corpusLines) {
            int governorIndex = line.hasElement(CorpusElement.NON_PROJ_GOVERNOR)
                    ? Integer.parseInt(line.getElement(CorpusElement.NON_PROJ_GOVERNOR))
                    : 0;
            PosTaggedToken governor = idTokenMap.get(governorIndex);
            PosTaggedToken dependent = idTokenMap.get(line.getIndex());
            DependencyArc nonProjectiveArc = new DependencyArc(governor, dependent, line.getElement(CorpusElement.NON_PROJ_LABEL));
            if (LOG.isTraceEnabled()) {
                LOG.trace(nonProjectiveArc.toString());
            }
            nonProjectiveArcs.add(nonProjectiveArc);
            if (line.hasElement(CorpusElement.DEP_COMMENT)) {
                nonProjectiveArc.setComment(line.getElement(CorpusElement.DEP_COMMENT));
            }
        }
        for (DependencyArc nonProjectiveArc : nonProjectiveArcs) {
            configuration.addManualNonProjectiveDependency(nonProjectiveArc.getHead(), nonProjectiveArc.getDependent(), nonProjectiveArc.getLabel());
        }
    } catch (TalismaneException e) {
        // Drop the partially built sentence before handing the failure to the caller.
        this.clearSentence();
        throw e;
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) TalismaneException(com.joliciel.talismane.TalismaneException) TreeSet(java.util.TreeSet) CorpusLine(com.joliciel.talismane.corpus.CorpusLine)

Example 2 with CorpusLine

use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.

the class ParseOutputRewriter method getCorpusLines.

/**
 * Converts a parse configuration into corpus lines and applies any matching split rules.
 * <p>
 * Every non-root token yields one corpus line carrying its index, token, lemma, pos-tag,
 * morphology, governor and label (projective and non-projective), raw output and probabilities.
 * Each line is then tested against every rewrite rule; a rule matches when all of its condition
 * patterns match the corresponding element of the line. If the matching rule's action is a
 * {@code SplitAction}, the line is marked for splitting (when several rules match the same line,
 * the last one is kept).
 * <p>
 * If at least one line is marked, a new list is built in which each marked line is replaced by
 * one line per entry of the split action's element values, and all indexes and governor indexes
 * are renumbered accordingly.
 * <p>
 * A conditional action whose relative index points outside the sentence, or whose condition
 * refers to an element absent from the target line, is treated as not matching rather than
 * failing with an {@code IndexOutOfBoundsException} or {@code NullPointerException}.
 *
 * @param parseConfiguration the parse configuration to convert
 * @return the corpus lines, after splitting if any split rule matched
 * @throws TalismaneException if a rewrite rule has a condition on an element other than POSTAG,
 *         TOKEN, LEMMA or LABEL (other causes may come from helpers not visible here)
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
    // first convert the parse configuration to a list of corpus lines
    List<CorpusLine> corpusLines = new ArrayList<>();
    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (posTaggedToken.isRoot()) {
            continue;
        }
        DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
        DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
        String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
        CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
        corpusLine.setIndex(posTaggedToken.getIndex());
        corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
        corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
        corpusLine.setPosTag(posTaggedToken.getTag().getCode());
        String morphology = posTaggedToken.getMorphologyForCoNLL();
        corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
        // Tokens without a governing arc are attached to index 0 with the placeholder label "_".
        corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
        corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
        corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
        corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
        if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
            corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
        if (posTaggedToken.getToken().getTrailingRawOutput() != null)
            corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
        corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
        corpusLine.setPosTagProbability(posTaggedToken.getProbability());
        if (arc != null)
            corpusLine.setParseProbability(arc.getProbability());
        corpusLines.add(corpusLine);
    }

    // next, find the split action (if any) that applies to each line
    Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
    for (CorpusLine corpusLine : corpusLines) {
        if (LOG.isDebugEnabled())
            LOG.debug(corpusLine.toString());
        for (RewriteRule rewriteRule : rewriteRules) {
            boolean matches = true;
            conditionLoop: for (Map.Entry<CorpusElement, Pattern> condition : rewriteRule.conditions.entrySet()) {
                CorpusElement corpusElement = condition.getKey();
                Pattern pattern = condition.getValue();
                if (LOG.isTraceEnabled())
                    LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
                switch (corpusElement) {
                    case POSTAG:
                        if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Match failed for " + corpusLine.getPosTag());
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case TOKEN:
                        if (!pattern.matcher(corpusLine.getToken()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LEMMA:
                        if (!pattern.matcher(corpusLine.getLemma()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LABEL:
                        if (!pattern.matcher(corpusLine.getLabel()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    default:
                        throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
                }
            }
            if (matches && rewriteRule.action instanceof SplitAction) {
                splitActions.put(corpusLine, (SplitAction) rewriteRule.action);
            }
        }
    }

    if (splitActions.isEmpty()) {
        return corpusLines;
    }

    // map each old index to its index after splitting; index 0 stays 0
    Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
    oldToNewIndexMap.put(0, 0);
    int currentIndex = 1;
    for (int i = 0; i < corpusLines.size(); i++) {
        oldToNewIndexMap.put(i + 1, currentIndex);
        SplitAction splitAction = splitActions.get(corpusLines.get(i));
        if (splitAction != null) {
            // a split line occupies one new index per split element
            currentIndex += splitAction.elementValues.size();
        } else {
            currentIndex++;
        }
    }

    List<CorpusLine> newCorpusLines = new ArrayList<>();
    for (int i = 0; i < corpusLines.size(); i++) {
        CorpusLine corpusLine = corpusLines.get(i);
        CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
        newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
        newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
        newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));

        SplitAction splitAction = splitActions.get(corpusLine);
        if (splitAction == null) {
            // line is not split: keep the renumbered clone
            newCorpusLines.add(newCorpusLine);
            continue;
        }

        for (int j = 0; j < splitAction.elementValues.size(); j++) {
            CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
            splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
            Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
            this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);

            // The first matching element in each group will be applied
            // The default element marks the end of each group, and will be
            // applied if no other match has applied.
            List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
            boolean groupHasMatch = false;
            for (ConditionalAction conditionalAction : conditionalActions) {
                if (conditionalAction.isDefault) {
                    if (!groupHasMatch) {
                        this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                    }
                    // The default action marks the end of each matching group.
                    groupHasMatch = false;
                    continue;
                }

                // A condition on a line outside the sentence cannot be satisfied.
                int baseIndex = i + conditionalAction.relativeIndex;
                boolean match = baseIndex >= 0 && baseIndex < corpusLines.size();
                if (match) {
                    CorpusLine baseLine = corpusLines.get(baseIndex);
                    for (Map.Entry<CorpusElement, Pattern> condition : conditionalAction.conditions.entrySet()) {
                        String origValue = baseLine.getElement(condition.getKey());
                        // A missing element cannot satisfy a pattern.
                        if (origValue == null || !condition.getValue().matcher(origValue).matches()) {
                            match = false;
                            break;
                        }
                    }
                }
                if (match) {
                    this.setElementValues(conditionalAction.elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                    groupHasMatch = true;
                }
            }
            newCorpusLines.add(splitCorpusLine);
        }
    }
    return newCorpusLines;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Pattern(java.util.regex.Pattern) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Example 3 with CorpusLine

use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.

the class ParseOutputRewriterTest method testGetCorpusLines.

/**
 * Reads a ten-token sentence through a {@code ParserRegexBasedCorpusReader} configured from
 * {@code testWithOutputRules.conf}, passes the resulting parse configuration through
 * {@code ParseOutputRewriter.getCorpusLines}, and checks the thirteen lines that come back:
 * "Au", "du" and "auquel" each come back as two lines, and the indexes and governor indexes of
 * all lines are renumbered to match.
 */
@Test
public void testGetCorpusLines() throws Exception {
    TalismaneSession.clearSessions();
    System.setProperty("config.file", "src/test/resources/testWithOutputRules.conf");
    ConfigFactory.invalidateCaches();
    final Config config = ConfigFactory.load();
    final String sessionId = "test";

    // Columns: index, token, lemma, pos-tag, governor, label.
    String input = "1\tAu\tau\tADP+DET\t0\troot\n"
            + "2\tsein\tsein\tNOUN\t1\tfixed\n"
            + "3\tmême\tmême\tADV\t1\tadvmod\n"
            + "4\tdu\tdu\tADP+DET\t5\tcase\n"
            + "5\tParti\tParti\tPROPN\t1\tnmod\n"
            + "6\tsocialiste\tsocialiste\tADJ\t5\tfixed\n"
            + "7\tauquel\tauquel\tADP+PRON\t8\tobl\n"
            + "8\tappartient\tappartenir\tVERB\t5\tacl:relcl\n"
            + "9\tM.\tmonsieur\tNOUN\t8\tnsubj\n"
            + "10\tDupont\tDupont\tPROPN\t9\tflat:name\n";

    StringReader stringReader = new StringReader(input);
    ParserRegexBasedCorpusReader reader = new ParserRegexBasedCorpusReader(stringReader, config.getConfig("talismane.core.test.parser.input"), sessionId);
    ParseConfiguration parseConfiguration = reader.nextConfiguration();
    final StringWriter writer = new StringWriter();
    try (ParseOutputRewriter rewriter = new ParseOutputRewriter(writer, sessionId)) {
        List<CorpusLine> corpusLines = rewriter.getCorpusLines(parseConfiguration);
        for (int position = 1; position <= corpusLines.size(); position++) {
            CorpusLine corpusLine = corpusLines.get(position - 1);
            LOG.debug("line " + corpusLine.getIndex() + ": " + corpusLine.getElements());
            switch (position) {
                case 1:
                    assertEquals(1, corpusLine.getIndex());
                    assertEquals("à", corpusLine.getToken());
                    assertEquals("à", corpusLine.getLemma());
                    assertEquals("ADP", corpusLine.getPosTag());
                    assertEquals(0, corpusLine.getGovernorIndex());
                    assertEquals("root", corpusLine.getLabel());
                    break;
                case 2:
                    assertEquals(2, corpusLine.getIndex());
                    assertEquals("le", corpusLine.getToken());
                    assertEquals("le", corpusLine.getLemma());
                    assertEquals("DET", corpusLine.getPosTag());
                    assertEquals(1, corpusLine.getGovernorIndex());
                    assertEquals("fixed", corpusLine.getLabel());
                    break;
                case 3:
                    assertEquals(3, corpusLine.getIndex());
                    assertEquals("sein", corpusLine.getToken());
                    assertEquals(1, corpusLine.getGovernorIndex());
                    assertEquals("fixed", corpusLine.getLabel());
                    break;
                case 4:
                    assertEquals(4, corpusLine.getIndex());
                    assertEquals("même", corpusLine.getToken());
                    assertEquals(1, corpusLine.getGovernorIndex());
                    assertEquals("advmod", corpusLine.getLabel());
                    break;
                case 5:
                    assertEquals(5, corpusLine.getIndex());
                    assertEquals("de", corpusLine.getToken());
                    assertEquals("de", corpusLine.getLemma());
                    assertEquals("ADP", corpusLine.getPosTag());
                    assertEquals(7, corpusLine.getGovernorIndex());
                    assertEquals("case", corpusLine.getLabel());
                    break;
                case 6:
                    assertEquals(6, corpusLine.getIndex());
                    assertEquals("le", corpusLine.getToken());
                    assertEquals("le", corpusLine.getLemma());
                    assertEquals("DET", corpusLine.getPosTag());
                    assertEquals(7, corpusLine.getGovernorIndex());
                    assertEquals("det", corpusLine.getLabel());
                    break;
                case 7:
                    assertEquals(7, corpusLine.getIndex());
                    assertEquals("Parti", corpusLine.getToken());
                    assertEquals(1, corpusLine.getGovernorIndex());
                    assertEquals("nmod", corpusLine.getLabel());
                    break;
                case 8:
                    assertEquals(8, corpusLine.getIndex());
                    assertEquals("socialiste", corpusLine.getToken());
                    assertEquals(7, corpusLine.getGovernorIndex());
                    assertEquals("fixed", corpusLine.getLabel());
                    break;
                case 9:
                    assertEquals(9, corpusLine.getIndex());
                    assertEquals("à", corpusLine.getToken());
                    assertEquals("à", corpusLine.getLemma());
                    assertEquals("ADP", corpusLine.getPosTag());
                    assertEquals(10, corpusLine.getGovernorIndex());
                    assertEquals("case", corpusLine.getLabel());
                    break;
                case 10:
                    assertEquals(10, corpusLine.getIndex());
                    assertEquals("lequel", corpusLine.getToken());
                    assertEquals("lequel", corpusLine.getLemma());
                    assertEquals("PRON", corpusLine.getPosTag());
                    assertEquals(11, corpusLine.getGovernorIndex());
                    assertEquals("obl", corpusLine.getLabel());
                    break;
                case 11:
                    assertEquals(11, corpusLine.getIndex());
                    assertEquals("appartient", corpusLine.getToken());
                    assertEquals("VERB", corpusLine.getPosTag());
                    assertEquals(7, corpusLine.getGovernorIndex());
                    assertEquals("acl:relcl", corpusLine.getLabel());
                    break;
                case 12:
                    assertEquals(12, corpusLine.getIndex());
                    assertEquals("M.", corpusLine.getToken());
                    assertEquals("NOUN", corpusLine.getPosTag());
                    assertEquals(11, corpusLine.getGovernorIndex());
                    assertEquals("nsubj", corpusLine.getLabel());
                    break;
                case 13:
                    assertEquals(13, corpusLine.getIndex());
                    assertEquals("Dupont", corpusLine.getToken());
                    assertEquals("PROPN", corpusLine.getPosTag());
                    assertEquals(12, corpusLine.getGovernorIndex());
                    assertEquals("flat:name", corpusLine.getLabel());
                    break;
                default:
                    // Extra lines are caught by the size assertion after the loop.
                    break;
            }
        }
        assertEquals(13, corpusLines.size());
    }
}
Also used : StringWriter(java.io.StringWriter) Config(com.typesafe.config.Config) StringReader(java.io.StringReader) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) ParserRegexBasedCorpusReader(com.joliciel.talismane.parser.ParserRegexBasedCorpusReader) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) TalismaneTest(com.joliciel.talismane.TalismaneTest) Test(org.junit.Test)

Example 4 with CorpusLine

use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.

the class PosTagRegexBasedCorpusReader method processSentence.

/**
 * Builds the pos-tag sequence for one sentence from its corpus lines.
 * <p>
 * After the superclass has processed the sentence, a new {@code PosTagSequence} is created over
 * the token sequence. Each corpus line is converted to a pos-tagged token at its position in the
 * list, and the token is registered in {@code idTokenMap} under the corpus line's index.
 *
 * @param sentence the sentence being processed
 * @param corpusLines the corpus lines read for this sentence
 * @throws TalismaneException rethrown after the current sentence has been cleared
 * @throws IOException propagated without clearing the sentence
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
    try {
        super.processSentence(sentence, corpusLines);
        posTagSequence = new PosTagSequence(tokenSequence);

        // Zero-based position of the line within the sentence, distinct from the line's own index.
        int position = 0;
        for (CorpusLine line : corpusLines) {
            PosTaggedToken taggedToken = this.convertToPosTaggedToken(line, posTagSequence, position, this.getCurrentFile());
            this.idTokenMap.put(line.getIndex(), taggedToken);
            position++;
        }
    } catch (TalismaneException e) {
        // Drop the partially built sentence before handing the failure to the caller.
        this.clearSentence();
        throw e;
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) CorpusLine(com.joliciel.talismane.corpus.CorpusLine)

Example 5 with CorpusLine

use of com.joliciel.talismane.corpus.CorpusLine in project talismane by joliciel-informatique.

the class TokenRegexBasedCorpusReader method processSentence.

/**
 * Builds the token sequence for one sentence from its corpus lines.
 * <p>
 * After the superclass has processed the sentence, a new {@code PretokenisedSequence} is created
 * and each corpus line is converted into a token on it. Every token filter is then applied to the
 * sequence, followed by a call to {@code cleanSlate()}.
 *
 * @param sentence the sentence being processed
 * @param corpusLines the corpus lines read for this sentence
 * @throws TalismaneException rethrown after the current sentence has been cleared
 * @throws IOException propagated without clearing the sentence
 */
@Override
protected void processSentence(Sentence sentence, List<CorpusLine> corpusLines) throws TalismaneException, IOException {
    try {
        super.processSentence(sentence, corpusLines);
        tokenSequence = new PretokenisedSequence(sentence, sessionId);

        for (CorpusLine line : corpusLines) {
            this.convertToToken(tokenSequence, line);
        }

        // Filters run only once every token has been added.
        for (TokenFilter tokenFilter : filters) {
            tokenFilter.apply(tokenSequence);
        }
        tokenSequence.cleanSlate();
    } catch (TalismaneException e) {
        // Drop the partially built sentence before handing the failure to the caller.
        this.clearSentence();
        throw e;
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) TokenFilter(com.joliciel.talismane.tokeniser.filters.TokenFilter)

Aggregations

CorpusLine (com.joliciel.talismane.corpus.CorpusLine)5 TalismaneException (com.joliciel.talismane.TalismaneException)4 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)2 TalismaneTest (com.joliciel.talismane.TalismaneTest)1 CorpusElement (com.joliciel.talismane.corpus.CorpusLine.CorpusElement)1 DependencyArc (com.joliciel.talismane.parser.DependencyArc)1 ParseConfiguration (com.joliciel.talismane.parser.ParseConfiguration)1 ParserRegexBasedCorpusReader (com.joliciel.talismane.parser.ParserRegexBasedCorpusReader)1 TokenFilter (com.joliciel.talismane.tokeniser.filters.TokenFilter)1 Config (com.typesafe.config.Config)1 StringReader (java.io.StringReader)1 StringWriter (java.io.StringWriter)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 TreeSet (java.util.TreeSet)1 Pattern (java.util.regex.Pattern)1 Test (org.junit.Test)1