Search in sources :

Example 1 with CorpusElement

use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.

The apply method of the CorpusRule class.

/**
 * Test this rule against a corpus line and, if every criterion is satisfied,
 * contribute the rule's actions to the supplied map of pending updates.
 * <p>
 * The rule matches only when the line has every element named in the
 * criteria and each such element's value fully matches the corresponding
 * pattern. An action is recorded only for elements that have no entry in
 * {@code values} yet, so when several rules are applied in turn to the same
 * map, the first matching rule to update a given element wins.
 *
 * @param corpusLine
 *          the corpus line to test
 * @param values
 *          the pending updates, added to in place
 */
public void apply(CorpusLine corpusLine, Map<CorpusElement, String> values) {
    for (Map.Entry<CorpusElement, Pattern> criterion : criteria.entrySet()) {
        CorpusElement required = criterion.getKey();
        // A line lacking a required element can never match.
        if (!corpusLine.hasElement(required)) {
            return;
        }
        String actual = corpusLine.getElement(required);
        if (!criterion.getValue().matcher(actual).matches()) {
            return;
        }
    }

    // Every criterion was satisfied.
    if (LOG.isTraceEnabled()) {
        LOG.trace("Rule " + this.toString() + " matched line " + corpusLine);
    }
    for (Map.Entry<CorpusElement, String> action : actions.entrySet()) {
        CorpusElement target = action.getKey();
        // Never overwrite an update already recorded for this element.
        if (values.containsKey(target)) {
            continue;
        }
        values.put(target, action.getValue());
    }
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Pattern(java.util.regex.Pattern)

Example 2 with CorpusElement

use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.

The getCorpusLines method of the ParseOutputRewriter class.

/**
 * Converts a parse configuration into corpus lines, then applies any
 * matching split rewrite rules.
 * <p>
 * One corpus line is built for every non-root pos-tagged token. Each line is
 * then tested against every rewrite rule: when all of a rule's conditions
 * match and its action is a {@code SplitAction}, the line is replaced by one
 * new line per entry in the action's element values. When at least one line
 * is split, a new list is built in which the indexes, governor indexes and
 * non-projective governor indexes of unsplit lines are remapped to account
 * for the inserted lines. If no line is split, the lines are returned as
 * first built.
 *
 * @param parseConfiguration
 *          the parse configuration to convert
 * @return the corpus lines, after any splits have been applied
 * @throws TalismaneException
 *           if a rewrite rule has a condition on an element other than
 *           POSTAG, TOKEN, LEMMA or LABEL, or if setElementValues rejects
 *           an element value
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
    // first convert the parse configuration to a list of corpus lines
    List<CorpusLine> corpusLines = new ArrayList<>();
    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (!posTaggedToken.isRoot()) {
            DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
            // NOTE(review): presumably the non-projective governing arc, given
            // the setters it feeds below - confirm the meaning of the flag.
            DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
            // Tab-separated text for the line. The pos-tag code appears in two
            // consecutive columns; a missing arc gives governor 0 and label "_".
            String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
            CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
            corpusLine.setIndex(posTaggedToken.getIndex());
            corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
            corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
            corpusLine.setPosTag(posTaggedToken.getTag().getCode());
            String morphology = posTaggedToken.getMorphologyForCoNLL();
            // An empty morphology is stored as "_".
            corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
            corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
            corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
            corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
            corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
            // Raw output is only set when the token actually has some.
            if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
                corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
            if (posTaggedToken.getToken().getTrailingRawOutput() != null)
                corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
            corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
            corpusLine.setPosTagProbability(posTaggedToken.getProbability());
            // The parse probability is left unset when there is no governing arc.
            if (arc != null)
                corpusLine.setParseProbability(arc.getProbability());
            corpusLines.add(corpusLine);
        }
    }
    // Split actions to apply, keyed by the corpus line that matched the rule.
    Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
    for (CorpusLine corpusLine : corpusLines) {
        if (LOG.isDebugEnabled())
            LOG.debug(corpusLine.toString());
        for (RewriteRule rewriteRule : rewriteRules) {
            boolean matches = true;
            // All conditions must match; the first failure abandons the
            // remaining conditions of this rule via the labelled break.
            conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
                Pattern pattern = rewriteRule.conditions.get(corpusElement);
                if (LOG.isTraceEnabled())
                    LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
                switch(corpusElement) {
                    case POSTAG:
                        if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Match failed for " + corpusLine.getPosTag());
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case TOKEN:
                        if (!pattern.matcher(corpusLine.getToken()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LEMMA:
                        if (!pattern.matcher(corpusLine.getLemma()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LABEL:
                        if (!pattern.matcher(corpusLine.getLabel()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    default:
                        // Only the four elements above can be used as conditions.
                        throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
                }
            }
            if (matches) {
                // Only split actions are handled here. If several rules match the
                // same line, the last one replaces the earlier ones in the map.
                if (rewriteRule.action instanceof SplitAction) {
                    SplitAction splitAction = (SplitAction) rewriteRule.action;
                    splitActions.put(corpusLine, splitAction);
                }
            }
        }
    }
    if (splitActions.size() > 0) {
        List<CorpusLine> newCorpusLines = new ArrayList<>();
        // First pass: work out the new index of every original line.
        Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
        // Index 0 (the governor index used above when there is no arc) maps
        // to itself.
        oldToNewIndexMap.put(0, 0);
        int currentIndex = 1;
        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            // NOTE(review): assumes the line at list position i carries index
            // i + 1 - confirm against PosTaggedToken.getIndex().
            oldToNewIndexMap.put(i + 1, currentIndex);
            if (splitActions.containsKey(corpusLine)) {
                // A split line takes up one new index per split element.
                SplitAction splitAction = splitActions.get(corpusLine);
                currentIndex += splitAction.elementValues.size();
            } else {
                currentIndex++;
            }
        }
        // Second pass: build the new list of lines.
        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            // Clone with remapped indexes. For a split line this clone is only
            // passed to setElementValues as the source of original values; it
            // is not itself added to the result.
            CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
            newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
            newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
            newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));
            if (splitActions.containsKey(corpusLine)) {
                SplitAction splitAction = splitActions.get(corpusLine);
                for (int j = 0; j < splitAction.elementValues.size(); j++) {
                    // Each split line starts from the original line text and line
                    // number, and takes consecutive indexes from the remapped one.
                    CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
                    splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
                    Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
                    this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                    // The first matching element in each group will be applied
                    // The default element marks the end of each group, and will be
                    // applied if no other match has applied.
                    List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
                    boolean groupHasMatch = false;
                    for (ConditionalAction conditionalAction : conditionalActions) {
                        // The line whose values are tested, at an offset from the
                        // current position in the original list. It is looked up
                        // for default actions too, although only non-default
                        // actions read it.
                        CorpusLine baseLine = corpusLines.get(i + conditionalAction.relativeIndex);
                        if (conditionalAction.isDefault) {
                            if (!groupHasMatch) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                            }
                            // The default action marks the end of each matching group.
                            groupHasMatch = false;
                        } else {
                            // Every condition must match the base line's value.
                            boolean match = true;
                            for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
                                String origValue = baseLine.getElement(corpusElement);
                                Pattern pattern = conditionalAction.conditions.get(corpusElement);
                                if (!pattern.matcher(origValue).matches()) {
                                    match = false;
                                    break;
                                }
                            }
                            if (match) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                                groupHasMatch = true;
                            }
                        }
                    }
                    // All conditional actions for this split line have been handled.
                    newCorpusLines.add(splitCorpusLine);
                }
            } else {
                // Unsplit line: keep the clone with its remapped indexes.
                newCorpusLines.add(newCorpusLine);
            }
        }
        // Replace the original lines with the rewritten ones.
        corpusLines = newCorpusLines;
    }
    return corpusLines;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Pattern(java.util.regex.Pattern) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Example 3 with CorpusElement

use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.

The setElementValues method of the ParseOutputRewriter class.

/**
 * Copy a set of element values onto a split corpus line.
 * <p>
 * Each value is interpreted in one of three ways:
 * <ul>
 * <li>the literal <code>${orig}</code> copies the same element from
 * {@code origLine};</li>
 * <li>a value fully matching {@code linePattern} is read as a line number
 * (the pattern's first group) and converted to an index relative to
 * {@code origLine}'s index; this is only accepted for the GOVERNOR and
 * NON_PROJ_GOVERNOR elements;</li>
 * <li>any other value is set on the split line as is.</li>
 * </ul>
 *
 * @param elementValues
 *          the values to apply, keyed by the element they apply to
 * @param oldToNewIndexMap
 *          not read by this method
 * @param origLine
 *          the line supplying original values and the base index
 * @param splitCorpusLine
 *          the line being updated
 * @throws TalismaneException
 *           if a line-number value is given for an element other than
 *           GOVERNOR or NON_PROJ_GOVERNOR
 */
private void setElementValues(Map<CorpusElement, String> elementValues, Map<Integer, Integer> oldToNewIndexMap, CorpusLine origLine, CorpusLine splitCorpusLine) throws TalismaneException {
    for (Map.Entry<CorpusElement, String> entry : elementValues.entrySet()) {
        CorpusElement target = entry.getKey();
        String rawValue = entry.getValue();

        // Placeholder for "keep whatever the original line had".
        if (rawValue.equals("${orig}")) {
            splitCorpusLine.setElement(target, origLine.getElement(target));
            continue;
        }

        Matcher lineMatcher = linePattern.matcher(rawValue);
        if (!lineMatcher.matches()) {
            // A plain literal value.
            splitCorpusLine.setElement(target, rawValue);
            continue;
        }

        // Line numbers in the value start at 1, hence the -1 when offsetting
        // from the original line's index.
        int relativeLine = Integer.parseInt(lineMatcher.group(1));
        int resolvedIndex = (origLine.getIndex() + relativeLine) - 1;
        if (target == CorpusElement.GOVERNOR) {
            splitCorpusLine.setGovernorIndex(resolvedIndex);
        } else if (target == CorpusElement.NON_PROJ_GOVERNOR) {
            splitCorpusLine.setNonProjGovernorIndex(resolvedIndex);
        } else {
            throw new TalismaneException("element value '" + rawValue + "' not supported for corpus element " + target.name());
        }
    }
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Matcher(java.util.regex.Matcher) TalismaneException(com.joliciel.talismane.TalismaneException)

Example 4 with CorpusElement

use of com.joliciel.talismane.corpus.CorpusLine.CorpusElement in project talismane by joliciel-informatique.

The read method of the CorpusLineReader class.

/**
 * Parse a single line of the corpus into a {@link CorpusLine}.
 * <p>
 * The line must fully match this reader's pattern. Every corpus element that
 * has a placeholder index is extracted from the match: TOKEN and LEMMA
 * values are converted back from CoNLL format, and for all other elements a
 * value of "_" is replaced by the empty string. If a lexical entry reader is
 * set, a lexical entry is read from the line and attached. Finally, all
 * corpus rules are applied and the resulting updates are written to the
 * corpus line.
 *
 * @param line
 *          the line to read
 * @param lineNumber
 *          the line number we reached, starting at 1.
 * @return the corpus line built from {@code line}
 * @throws TalismaneException
 *           if the regex wasn't matched on a given line
 */
public CorpusLine read(String line, int lineNumber) throws TalismaneException {
    Matcher matcher = this.pattern.matcher(line);
    if (!matcher.matches()) {
        throw new TalismaneException("Didn't match pattern \"" + regex + "\". Compiled to: \"" + this.pattern.pattern() + "\". On line " + lineNumber + ": " + line);
    }

    CorpusLine result = new CorpusLine(line, lineNumber);
    for (CorpusElement elementType : CorpusElement.values()) {
        // Elements without a placeholder are not present in this format.
        if (!placeholderIndexMap.containsKey(elementType)) {
            continue;
        }
        String value = matcher.group(placeholderIndexMap.get(elementType));
        if (elementType == CorpusElement.TOKEN || elementType == CorpusElement.LEMMA) {
            value = TalismaneSession.get(sessionId).getCoNLLFormatter().fromCoNLL(value);
        } else if ("_".equals(value)) {
            // "_" stands for an empty value.
            value = "";
        }
        result.setElement(elementType, value);
    }

    if (this.lexicalEntryReader != null) {
        WritableLexicalEntry entry = new CompactLexicalEntry(lexicalEntrySupport);
        this.lexicalEntryReader.readEntry(line, entry);
        result.setLexicalEntry(entry);
    }

    // Collect the updates from every rule first, then apply them in one go.
    Map<CorpusElement, String> updates = new HashMap<>();
    for (CorpusRule rule : corpusRules) {
        rule.apply(result, updates);
    }
    for (Map.Entry<CorpusElement, String> update : updates.entrySet()) {
        CorpusElement element = update.getKey();
        String value = update.getValue();
        if (LOG.isTraceEnabled()) {
            LOG.trace("On line " + lineNumber + ", updating " + element.name() + " from '" + result.getElement(element) + "' to '" + value + "'");
        }
        result.setElement(element, value);
    }
    return result;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Matcher(java.util.regex.Matcher) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry)

Aggregations

CorpusElement (com.joliciel.talismane.corpus.CorpusLine.CorpusElement): 4, TalismaneException (com.joliciel.talismane.TalismaneException): 3, HashMap (java.util.HashMap): 2, Matcher (java.util.regex.Matcher): 2, Pattern (java.util.regex.Pattern): 2, CorpusLine (com.joliciel.talismane.corpus.CorpusLine): 1, CompactLexicalEntry (com.joliciel.talismane.lexicon.CompactLexicalEntry): 1, WritableLexicalEntry (com.joliciel.talismane.lexicon.WritableLexicalEntry): 1, DependencyArc (com.joliciel.talismane.parser.DependencyArc): 1, PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken): 1, ArrayList (java.util.ArrayList): 1