Search in sources :

Example 11 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

Method getRules of the class PosTaggerFeatureParser.

/**
 * Builds the pos-tagger rules described by a list of tab-delimited rule
 * descriptors.<br>
 * <br>
 * Empty descriptors and descriptors starting with "#" are skipped. Every
 * other descriptor must have one of the following forms:<br>
 * - posTagCode TAB descriptor<br>
 * - posTagCode TAB name TAB descriptor<br>
 * <br>
 * A posTagCode prefixed with "!" produces a negative rule. A descriptor with
 * an empty posTagCode must be named; it is parsed but produces no rule.
 *
 * @param ruleDescriptors
 *          the raw rule descriptors, one per entry
 * @return one rule per boolean feature parsed from each descriptor that has a
 *         pos-tag
 * @throws TalismaneException
 *           if a rule is incorrectly configured: fewer than two tab-delimited
 *           parts, no pos-tag and no name, or a feature that is not boolean
 */
public List<PosTaggerRule> getRules(List<String> ruleDescriptors) throws TalismaneException {
    List<PosTaggerRule> rules = new ArrayList<>();
    FunctionDescriptorParser descriptorParser = new FunctionDescriptorParser();
    for (String ruleDescriptor : ruleDescriptors) {
        LOG.debug(ruleDescriptor);
        if (ruleDescriptor.length() > 0 && !ruleDescriptor.startsWith("#")) {
            String[] ruleParts = ruleDescriptor.split("\t");
            // String.split drops trailing empty strings, so a descriptor without a
            // tab (or made only of tabs) yields fewer than two parts. Report it as
            // a configuration error rather than failing on an array index.
            if (ruleParts.length < 2) {
                throw new TalismaneException("Rule must contain a PosTag and a descriptor separated by a tab: " + ruleDescriptor);
            }
            String posTagCode = ruleParts[0];
            PosTag posTag = null;
            boolean negative = false;
            String descriptor = null;
            String descriptorName = null;
            if (ruleParts.length > 2) {
                // three-part form: posTagCode, name, descriptor
                descriptor = ruleParts[2];
                descriptorName = ruleParts[1];
            } else {
                descriptor = ruleParts[1];
            }
            if (posTagCode.length() == 0) {
                if (descriptorName == null) {
                    throw new TalismaneException("Rule without PosTag must have a name.");
                }
            } else {
                if (posTagCode.startsWith("!")) {
                    negative = true;
                    posTagCode = posTagCode.substring(1);
                }
                posTag = TalismaneSession.get(sessionId).getPosTagSet().getPosTag(posTagCode);
            }
            FunctionDescriptor functionDescriptor = descriptorParser.parseDescriptor(descriptor);
            if (descriptorName != null) {
                functionDescriptor.setDescriptorName(descriptorName);
            }
            List<PosTaggerFeature<?>> myFeatures = this.parseDescriptor(functionDescriptor);
            // Without a pos-tag the descriptor is parsed but no rule is created.
            if (posTag != null) {
                for (PosTaggerFeature<?> feature : myFeatures) {
                    if (feature instanceof BooleanFeature) {
                        @SuppressWarnings("unchecked") BooleanFeature<PosTaggerContext> condition = (BooleanFeature<PosTaggerContext>) feature;
                        PosTaggerRule rule = new PosTaggerRule(condition, posTag);
                        rule.setNegative(negative);
                        rules.add(rule);
                    } else {
                        throw new TalismaneException("Rule must be based on a boolean feature.");
                    }
                }
            }
        }
    }
    return rules;
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) FunctionDescriptorParser(com.joliciel.talismane.machineLearning.features.FunctionDescriptorParser) FunctionDescriptor(com.joliciel.talismane.machineLearning.features.FunctionDescriptor) PosTaggerContext(com.joliciel.talismane.posTagger.PosTaggerContext) BooleanFeature(com.joliciel.talismane.machineLearning.features.BooleanFeature) PosTag(com.joliciel.talismane.posTagger.PosTag)

Example 12 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

Method checkInternal of the class PosTaggerHistoryAddressFunction.

/**
 * Looks up the pos-tagged token located a given number of positions before
 * the current token in the context's history.
 *
 * @param context
 *          supplies the current token and the history of pos-tagged tokens
 * @param env
 *          runtime environment handed to the offset feature
 * @return the result generated for the earlier token, or null if the offset
 *         feature yields no result, the offset reaches before the start of the
 *         history, or the history entry is null
 * @throws TalismaneException
 *           if the offset is not strictly negative
 */
@Override
protected FeatureResult<PosTaggedTokenWrapper> checkInternal(PosTaggerContext context, RuntimeEnvironment env) throws TalismaneException {
    FeatureResult<Integer> offsetResult = offsetFeature.check(context, env);
    if (offsetResult == null) {
        return null;
    }

    int offset = offsetResult.getOutcome();
    if (offset >= 0) {
        throw new TalismaneException("Cannot call PosTaggerHistoryFeature with an offset >= 0");
    }

    // The offset is negative; work with the distance to step back.
    int distance = -offset;
    int currentIndex = context.getToken().getIndex();
    if (currentIndex < distance) {
        return null;
    }

    PosTaggedToken earlierToken = context.getHistory().get(currentIndex - distance);
    if (earlierToken == null) {
        return null;
    }
    return this.generateResult(earlierToken);
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) TalismaneException(com.joliciel.talismane.TalismaneException)

Example 13 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

Method getProcessors of the class PosTagSequenceProcessor.

/**
 * Instantiates the processors listed under the configuration key
 * talismane.core.[sessionId].pos-tagger.output.processors.<br>
 * <br>
 * Every listed class must implement this interface and expose a constructor
 * with one of these signatures:<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 * <br>
 * A class may additionally expose:<br>
 * - ( {@link Writer} writer, {@link String} sessionId)<br>
 * When a writer is supplied, it is handed to the first listed class that has
 * this constructor; all other classes use the signatures above, the
 * File-based one taking precedence.
 *
 * @param writer
 *          if not null, given to the first processor in the list that has a
 *          writer constructor
 * @param outDir
 *          directory in which to write the various outputs; created if it is
 *          not null
 * @param sessionId
 *          session identifier, used in the configuration key and passed to
 *          every constructor
 * @return the instantiated processors, in configuration order
 * @throws IOException
 * @throws ReflectiveOperationException
 *           if a class cannot be loaded or a constructor call fails
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<PosTagSequenceProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
    Config posTaggerConfig = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".pos-tagger");
    List<String> processorClassNames = posTaggerConfig.getStringList("output.processors");
    if (outDir != null) {
        outDir.mkdirs();
    }

    List<PosTagSequenceProcessor> result = new ArrayList<>();
    // Becomes null once a processor has consumed the writer.
    Writer pendingWriter = writer;
    for (String className : processorClassNames) {
        Class<?> loadedClass = Class.forName(className);
        if (!PosTagSequenceProcessor.class.isAssignableFrom(loadedClass)) {
            throw new TalismaneException("Class " + className + " does not implement interface " + PosTagSequenceProcessor.class.getSimpleName());
        }
        Class<? extends PosTagSequenceProcessor> processorClass = loadedClass.asSubclass(PosTagSequenceProcessor.class);

        PosTagSequenceProcessor processor = null;

        // 1. Writer-based constructor, only while the writer is still available.
        if (pendingWriter != null) {
            Constructor<? extends PosTagSequenceProcessor> writerConstructor = null;
            try {
                writerConstructor = processorClass.getConstructor(Writer.class, String.class);
            } catch (NoSuchMethodException e) {
                // optional constructor: fall through to the next signature
            }
            if (writerConstructor != null) {
                processor = writerConstructor.newInstance(pendingWriter, sessionId);
                pendingWriter = null;
            }
        }

        // 2. Output-directory constructor.
        if (processor == null) {
            Constructor<? extends PosTagSequenceProcessor> dirConstructor = null;
            try {
                dirConstructor = processorClass.getConstructor(File.class, String.class);
            } catch (NoSuchMethodException e) {
                // fall through to the session-only signature
            }
            if (dirConstructor != null) {
                processor = dirConstructor.newInstance(outDir, sessionId);
            }
        }

        // 3. Session-only constructor; last resort.
        if (processor == null) {
            Constructor<? extends PosTagSequenceProcessor> sessionConstructor = null;
            try {
                sessionConstructor = processorClass.getConstructor(String.class);
            } catch (NoSuchMethodException e) {
                // reported just below
            }
            if (sessionConstructor == null) {
                throw new TalismaneException("No constructor found with correct signature for: " + className);
            }
            processor = sessionConstructor.newInstance(sessionId);
        }

        result.add(processor);
    }
    return result;
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) File(java.io.File) Writer(java.io.Writer)

Example 14 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

Method getProcessors of the class ParseConfigurationProcessor.

/**
 * Instantiates the processors listed under the configuration key
 * talismane.core.[sessionId].parser.output.processors.<br>
 * <br>
 * Every listed class must implement this interface and expose a constructor
 * with one of these signatures:<br>
 * - ( {@link File} outputDir, {@link String} sessionId)<br>
 * - ( {@link String} sessionId)<br>
 * <br>
 * A class may additionally expose:<br>
 * - ( {@link Writer} writer, {@link String} sessionId)<br>
 * When a writer is supplied, it is handed to the first listed class that has
 * this constructor; all other classes use the signatures above, the
 * File-based one taking precedence.
 *
 * @param writer
 *          if not null, given to the first processor in the list that has a
 *          writer constructor
 * @param outDir
 *          directory in which to write the various outputs; created if it is
 *          not null
 * @param sessionId
 *          session identifier, used in the configuration key and passed to
 *          every constructor
 * @return the instantiated processors, in configuration order
 * @throws IOException
 * @throws ReflectiveOperationException
 *           if a class cannot be loaded or a constructor call fails
 * @throws TalismaneException
 *           if a processor does not implement this interface, or if no
 *           constructor is found with the correct signature
 */
public static List<ParseConfigurationProcessor> getProcessors(Writer writer, File outDir, String sessionId) throws IOException, ReflectiveOperationException, ClassNotFoundException, TalismaneException {
    Config parserSettings = ConfigFactory.load().getConfig("talismane.core." + sessionId + ".parser");
    List<String> processorClassNames = parserSettings.getStringList("output.processors");
    if (outDir != null) {
        outDir.mkdirs();
    }

    List<ParseConfigurationProcessor> result = new ArrayList<>();
    // Becomes null once a processor has consumed the writer.
    Writer pendingWriter = writer;
    for (String className : processorClassNames) {
        Class<?> loadedClass = Class.forName(className);
        if (!ParseConfigurationProcessor.class.isAssignableFrom(loadedClass)) {
            throw new TalismaneException("Class " + className + " does not implement interface " + ParseConfigurationProcessor.class.getSimpleName());
        }
        Class<? extends ParseConfigurationProcessor> processorClass = loadedClass.asSubclass(ParseConfigurationProcessor.class);

        ParseConfigurationProcessor processor = null;

        // 1. Writer-based constructor, only while the writer is still available.
        if (pendingWriter != null) {
            Constructor<? extends ParseConfigurationProcessor> writerConstructor = null;
            try {
                writerConstructor = processorClass.getConstructor(Writer.class, String.class);
            } catch (NoSuchMethodException e) {
                // optional constructor: fall through to the next signature
            }
            if (writerConstructor != null) {
                processor = writerConstructor.newInstance(pendingWriter, sessionId);
                pendingWriter = null;
            }
        }

        // 2. Output-directory constructor.
        if (processor == null) {
            Constructor<? extends ParseConfigurationProcessor> dirConstructor = null;
            try {
                dirConstructor = processorClass.getConstructor(File.class, String.class);
            } catch (NoSuchMethodException e) {
                // fall through to the session-only signature
            }
            if (dirConstructor != null) {
                processor = dirConstructor.newInstance(outDir, sessionId);
            }
        }

        // 3. Session-only constructor; last resort.
        if (processor == null) {
            Constructor<? extends ParseConfigurationProcessor> sessionConstructor = null;
            try {
                sessionConstructor = processorClass.getConstructor(String.class);
            } catch (NoSuchMethodException e) {
                // reported just below
            }
            if (sessionConstructor == null) {
                throw new TalismaneException("No constructor found with correct signature for: " + className);
            }
            processor = sessionConstructor.newInstance(sessionId);
        }

        result.add(processor);
    }
    return result;
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) File(java.io.File) Writer(java.io.Writer)

Example 15 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

Method getCorpusLines of the class ParseOutputRewriter.

/**
 * Converts a parse configuration into corpus lines (one per non-root
 * pos-tagged token), then applies the split actions of any rewrite rules
 * whose conditions all match a line.
 *
 * @param parseConfiguration
 *          the parse whose pos-tag sequence and governing dependencies are
 *          converted
 * @return the corpus lines; if at least one split action matched, a new,
 *         re-indexed list in which each matched line is replaced by the lines
 *         produced by its split action
 * @throws TalismaneException
 *           if a rewrite rule has a condition on an element other than POSTAG,
 *           TOKEN, LEMMA or LABEL
 */
List<CorpusLine> getCorpusLines(ParseConfiguration parseConfiguration) throws TalismaneException {
    // first convert the parse configuration to a list of corpus lines
    List<CorpusLine> corpusLines = new ArrayList<>();
    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (!posTaggedToken.isRoot()) {
            DependencyArc arc = parseConfiguration.getGoverningDependency(posTaggedToken);
            // Second lookup (passing false) feeds the non-projective governor and
            // label fields below.
            DependencyArc nonProjArc = parseConfiguration.getGoverningDependency(posTaggedToken, false);
            // Tab-separated text of the line: index, original text, lemma, pos-tag
            // code (twice), morphology, governor index (0 if no arc) and label
            // ("_" if no arc).
            String line = posTaggedToken.getIndex() + "\t" + posTaggedToken.getToken().getOriginalText() + "\t" + posTaggedToken.getLemmaForCoNLL() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getTag().getCode() + "\t" + posTaggedToken.getMorphologyForCoNLL() + "\t" + (arc != null ? arc.getHead().getIndex() : 0) + "\t" + (arc != null ? arc.getLabel() : "_");
            CorpusLine corpusLine = new CorpusLine(line, posTaggedToken.getToken().getLineNumber());
            corpusLine.setIndex(posTaggedToken.getIndex());
            corpusLine.setToken(posTaggedToken.getToken().getOriginalText());
            corpusLine.setLemma(posTaggedToken.getLemmaForCoNLL());
            corpusLine.setPosTag(posTaggedToken.getTag().getCode());
            String morphology = posTaggedToken.getMorphologyForCoNLL();
            // An empty morphology is stored as "_" (the raw line text above keeps
            // the empty value).
            corpusLine.setMorphology(morphology.length() == 0 ? "_" : morphology);
            corpusLine.setGovernorIndex(arc != null ? arc.getHead().getIndex() : 0);
            corpusLine.setLabel(arc != null ? arc.getLabel() : "_");
            corpusLine.setNonProjGovernorIndex(nonProjArc != null ? nonProjArc.getHead().getIndex() : 0);
            corpusLine.setNonProjLabel(nonProjArc != null ? nonProjArc.getLabel() : "_");
            if (posTaggedToken.getToken().getPrecedingRawOutput() != null)
                corpusLine.setElement(CorpusElement.PRECEDING_RAW_OUTPUT, posTaggedToken.getToken().getPrecedingRawOutput());
            if (posTaggedToken.getToken().getTrailingRawOutput() != null)
                corpusLine.setElement(CorpusElement.TRAILING_RAW_OUTPUT, posTaggedToken.getToken().getTrailingRawOutput());
            corpusLine.setTokenProbability(posTaggedToken.getToken().getProbability());
            corpusLine.setPosTagProbability(posTaggedToken.getProbability());
            if (arc != null)
                corpusLine.setParseProbability(arc.getProbability());
            corpusLines.add(corpusLine);
        }
    }
    // Second pass: find, for each line, the split action of a rewrite rule whose
    // conditions all match. If several rules with a split action match the same
    // line, the later put replaces the earlier one.
    Map<CorpusLine, SplitAction> splitActions = new HashMap<>();
    for (CorpusLine corpusLine : corpusLines) {
        if (LOG.isDebugEnabled())
            LOG.debug(corpusLine.toString());
        for (RewriteRule rewriteRule : rewriteRules) {
            boolean matches = true;
            // Every condition must match; the first failure abandons this rule.
            conditionLoop: for (CorpusElement corpusElement : rewriteRule.conditions.keySet()) {
                Pattern pattern = rewriteRule.conditions.get(corpusElement);
                if (LOG.isTraceEnabled())
                    LOG.trace("For " + corpusElement.name() + ", matching " + pattern.pattern());
                switch(corpusElement) {
                    case POSTAG:
                        if (!pattern.matcher(corpusLine.getPosTag()).matches()) {
                            if (LOG.isTraceEnabled())
                                LOG.trace("Match failed for " + corpusLine.getPosTag());
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case TOKEN:
                        if (!pattern.matcher(corpusLine.getToken()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LEMMA:
                        if (!pattern.matcher(corpusLine.getLemma()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    case LABEL:
                        if (!pattern.matcher(corpusLine.getLabel()).matches()) {
                            matches = false;
                            break conditionLoop;
                        }
                        break;
                    default:
                        // Conditions are only supported on the four elements above.
                        throw new TalismaneException(ParseOutputRewriter.class.getSimpleName() + " cannot match on " + corpusElement.name());
                }
            }
            if (matches) {
                // Only split actions are handled here; other action types are ignored.
                if (rewriteRule.action instanceof SplitAction) {
                    SplitAction splitAction = (SplitAction) rewriteRule.action;
                    splitActions.put(corpusLine, splitAction);
                }
            }
        }
    }
    if (splitActions.size() > 0) {
        List<CorpusLine> newCorpusLines = new ArrayList<>();
        // Maps each old index to the index it will have once split lines have been
        // expanded. Index 0 stays 0.
        // NOTE(review): keys are list position + 1, which assumes corpus line
        // indexes are contiguous starting at 1 - confirm.
        Map<Integer, Integer> oldToNewIndexMap = new HashMap<>();
        oldToNewIndexMap.put(0, 0);
        int currentIndex = 1;
        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            oldToNewIndexMap.put(i + 1, currentIndex);
            if (splitActions.containsKey(corpusLine)) {
                // A split line occupies as many new indexes as it has element-value
                // groups.
                SplitAction splitAction = splitActions.get(corpusLine);
                currentIndex += splitAction.elementValues.size();
            } else {
                currentIndex++;
            }
        }
        for (int i = 0; i < corpusLines.size(); i++) {
            CorpusLine corpusLine = corpusLines.get(i);
            // Re-indexed copy of the original line: added as-is when the line is
            // not split, and passed to setElementValues for each split line
            // otherwise.
            // NOTE(review): the map lookups below are unboxed; an index missing
            // from the map would fail with a NullPointerException - confirm this
            // cannot happen.
            CorpusLine newCorpusLine = corpusLine.cloneCorpusLine();
            newCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()));
            newCorpusLine.setGovernorIndex(oldToNewIndexMap.get(corpusLine.getGovernorIndex()));
            newCorpusLine.setNonProjGovernorIndex(oldToNewIndexMap.get(corpusLine.getNonProjGovernorIndex()));
            if (splitActions.containsKey(corpusLine)) {
                SplitAction splitAction = splitActions.get(corpusLine);
                for (int j = 0; j < splitAction.elementValues.size(); j++) {
                    // Each split line reuses the original line text and line number,
                    // and takes consecutive indexes starting at the remapped index.
                    CorpusLine splitCorpusLine = new CorpusLine(corpusLine.getLine(), corpusLine.getLineNumber());
                    splitCorpusLine.setIndex(oldToNewIndexMap.get(corpusLine.getIndex()) + j);
                    Map<CorpusElement, String> elementValues = splitAction.elementValues.get(j);
                    this.setElementValues(elementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                    // The first matching element in each group will be applied
                    // The default element marks the end of each group, and will be
                    // applied if no other match has applied.
                    List<ConditionalAction> conditionalActions = splitAction.conditionalValues.get(j);
                    boolean groupHasMatch = false;
                    for (ConditionalAction conditionalAction : conditionalActions) {
                        // Line whose elements the conditions are tested against, at a
                        // relative offset from the current line.
                        // NOTE(review): there is no bounds check, and the lookup also
                        // runs for default actions, which do not use baseLine - an
                        // offset outside the list would throw here.
                        CorpusLine baseLine = corpusLines.get(i + conditionalAction.relativeIndex);
                        if (conditionalAction.isDefault) {
                            if (!groupHasMatch) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                            }
                            // The default action marks the end of each matching group.
                            groupHasMatch = false;
                        } else {
                            // NOTE(review): groupHasMatch is not consulted in this
                            // branch, so every matching action in a group is applied in
                            // turn - confirm this agrees with "the first matching
                            // element" above.
                            boolean match = true;
                            for (CorpusElement corpusElement : conditionalAction.conditions.keySet()) {
                                String origValue = baseLine.getElement(corpusElement);
                                Pattern pattern = conditionalAction.conditions.get(corpusElement);
                                if (!pattern.matcher(origValue).matches()) {
                                    match = false;
                                    break;
                                }
                            }
                            if (match) {
                                Map<CorpusElement, String> conditionalElementValues = conditionalAction.elementValues;
                                this.setElementValues(conditionalElementValues, oldToNewIndexMap, newCorpusLine, splitCorpusLine);
                                groupHasMatch = true;
                            }
                        // did this action match?
                        }
                    // default action?
                    }
                    // next conditional action
                    newCorpusLines.add(splitCorpusLine);
                }
            // next split
            } else {
                newCorpusLines.add(newCorpusLine);
            }
        // should line be split?
        }
        // next corpus line
        corpusLines = newCorpusLines;
    }
    return corpusLines;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Pattern(java.util.regex.Pattern) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) CorpusLine(com.joliciel.talismane.corpus.CorpusLine) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException): 47; ArrayList (java.util.ArrayList): 27; Config (com.typesafe.config.Config): 14; File (java.io.File): 11; List (java.util.List): 10; TreeSet (java.util.TreeSet): 10; FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult): 9; IOException (java.io.IOException): 9; HashMap (java.util.HashMap): 9; Set (java.util.Set): 9; Decision (com.joliciel.talismane.machineLearning.Decision): 8; RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment): 8; PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken): 8; Token (com.joliciel.talismane.tokeniser.Token): 8; Map (java.util.Map): 8; SortedSet (java.util.SortedSet): 8; Collectors (java.util.stream.Collectors): 8; Logger (org.slf4j.Logger): 8; LoggerFactory (org.slf4j.LoggerFactory): 8; Sentence (com.joliciel.talismane.rawText.Sentence): 7