Search in sources :

Example 36 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class DependencyNode method autoPopulate.

/**
 * Recursively build the sub-tree below this node: every dependent that the
 * parse configuration reports for this node's token is attached as a child
 * node, and each child is then populated in the same way.
 *
 * @throws RuntimeException
 *           wrapping the {@link TalismaneException} raised if a dependent
 *           cannot be added (the failure is logged first)
 */
public void autoPopulate() {
    for (PosTaggedToken child : parseConfiguration.getDependents(this.token)) {
        DependencyNode node;
        try {
            node = this.addDependent(child);
        } catch (TalismaneException unexpected) {
            // should never happen: log it and escalate as an unchecked
            // exception, since this method declares no checked exceptions
            LOG.error(unexpected.getMessage(), unexpected);
            throw new RuntimeException(unexpected);
        }
        // descend only once the child has been attached successfully
        node.autoPopulate();
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) TalismaneException(com.joliciel.talismane.TalismaneException)

Example 37 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class CorpusLineReader method read.

/**
 * Parse a single corpus line into a {@link CorpusLine}.
 * <p>
 * The line is matched against this reader's compiled pattern; for every
 * {@link CorpusElement} that has a placeholder, the corresponding matcher
 * group is stored on the result. If a lexical entry reader is configured, a
 * lexical entry is read from the same line. Finally all corpus rules are
 * applied, and the values they request are written back onto the result.
 *
 * @param line
 *          the line to read
 * @param lineNumber
 *          the line number we reached, starting at 1.
 * @return the corpus line built from the text
 * @throws TalismaneException
 *           if the regex wasn't matched on a given line
 */
public CorpusLine read(String line, int lineNumber) throws TalismaneException {
    Matcher lineMatcher = this.pattern.matcher(line);
    if (!lineMatcher.matches()) {
        throw new TalismaneException("Didn't match pattern \"" + regex + "\". Compiled to: \"" + this.pattern.pattern() + "\". On line " + lineNumber + ": " + line);
    }

    CorpusLine result = new CorpusLine(line, lineNumber);
    for (CorpusElement elementType : CorpusElement.values()) {
        if (!placeholderIndexMap.containsKey(elementType)) {
            // no placeholder for this element in the pattern
            continue;
        }
        String rawValue = lineMatcher.group(placeholderIndexMap.get(elementType));
        String value;
        if (elementType == CorpusElement.TOKEN || elementType == CorpusElement.LEMMA) {
            // tokens and lemmas go through the session's CoNLL formatter
            value = TalismaneSession.get(sessionId).getCoNLLFormatter().fromCoNLL(rawValue);
        } else {
            // for all other elements a lone underscore is stored as the empty string
            value = "_".equals(rawValue) ? "" : rawValue;
        }
        result.setElement(elementType, value);
    }

    if (this.lexicalEntryReader != null) {
        WritableLexicalEntry entry = new CompactLexicalEntry(lexicalEntrySupport);
        this.lexicalEntryReader.readEntry(line, entry);
        result.setLexicalEntry(entry);
    }

    // the rules record their requested changes here; they are applied
    // to the line only after every rule has run
    Map<CorpusElement, String> updateValues = new HashMap<>();
    for (CorpusRule rule : corpusRules) {
        rule.apply(result, updateValues);
    }
    for (Map.Entry<CorpusElement, String> update : updateValues.entrySet()) {
        CorpusElement target = update.getKey();
        String newValue = update.getValue();
        if (LOG.isTraceEnabled()) {
            LOG.trace("On line " + lineNumber + ", updating " + target.name() + " from '" + result.getElement(target) + "' to '" + newValue + "'");
        }
        result.setElement(target, newValue);
    }
    return result;
}
Also used : CorpusElement(com.joliciel.talismane.corpus.CorpusLine.CorpusElement) Matcher(java.util.regex.Matcher) TalismaneException(com.joliciel.talismane.TalismaneException) HashMap(java.util.HashMap) CompactLexicalEntry(com.joliciel.talismane.lexicon.CompactLexicalEntry) WritableLexicalEntry(com.joliciel.talismane.lexicon.WritableLexicalEntry)

Example 38 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class LinearSVMModelTrainer method getFeatureMatrix.

/**
 * Read every event from the stream and convert it into one row of a feature
 * matrix, filling in the supplied index maps, outcome list and counters
 * along the way.
 *
 * @param corpusEventStream
 *          the events to convert; read until exhausted
 * @param featureIndexMap
 *          passed on to {@code addFeatureResult}; its size is logged as the
 *          feature count
 * @param outcomeIndexMap
 *          outcome name to outcome index; an outcome whose lookup yields a
 *          negative value is assigned the next free index
 * @param outcomeList
 *          receives the outcome index of each event, in reading order
 * @param featureCountMap
 *          passed on to {@code addFeatureResult}
 * @param countingInfo
 *          running counters (next outcome index, number of events), updated
 *          in place
 * @return one feature array per event; the matrix has
 *         {@code countingInfo.numEvents} rows
 * @throws RuntimeException
 *           wrapping any {@link TalismaneException} or {@link IOException}
 *           raised while reading the events (logged first)
 */
private Feature[][] getFeatureMatrix(ClassificationEventStream corpusEventStream, TObjectIntMap<String> featureIndexMap, TObjectIntMap<String> outcomeIndexMap, TIntList outcomeList, TIntIntMap featureCountMap, CountingInfo countingInfo) {
    try {
        List<Feature[]> fullFeatureList = new ArrayList<>();
        while (corpusEventStream.hasNext()) {
            ClassificationEvent corpusEvent = corpusEventStream.next();
            int outcomeIndex = outcomeIndexMap.get(corpusEvent.getClassification());
            if (outcomeIndex < 0) {
                // outcome not seen before: give it the next free index
                outcomeIndex = countingInfo.currentOutcomeIndex++;
                outcomeIndexMap.put(corpusEvent.getClassification(), outcomeIndex);
            }
            outcomeList.add(outcomeIndex);
            // sorted map, so the values come out in ascending key order
            // (presumably the feature index - see addFeatureResult)
            Map<Integer, Feature> featureList = new TreeMap<>();
            for (FeatureResult<?> featureResult : corpusEvent.getFeatureResults()) {
                if (featureResult.getOutcome() instanceof List) {
                    // collection result: one weighted feature per outcome in the list
                    @SuppressWarnings("unchecked") FeatureResult<List<WeightedOutcome<String>>> stringCollectionResult = (FeatureResult<List<WeightedOutcome<String>>>) featureResult;
                    for (WeightedOutcome<String> stringOutcome : stringCollectionResult.getOutcome()) {
                        String featureName = featureResult.getTrainingName() + "|" + featureResult.getTrainingOutcome(stringOutcome.getOutcome());
                        double value = stringOutcome.getWeight();
                        this.addFeatureResult(featureName, value, featureList, featureIndexMap, featureCountMap, countingInfo);
                    }
                } else {
                    // single result: numeric outcomes keep their value, anything else counts as 1.0
                    double value = 1.0;
                    if (featureResult.getOutcome() instanceof Double) {
                        @SuppressWarnings("unchecked") FeatureResult<Double> doubleResult = (FeatureResult<Double>) featureResult;
                        value = doubleResult.getOutcome().doubleValue();
                    }
                    this.addFeatureResult(featureResult.getTrainingName(), value, featureList, featureIndexMap, featureCountMap, countingInfo);
                }
            }
            // convert to array immediately, to avoid double storage
            fullFeatureList.add(featureList.values().toArray(new Feature[0]));
            countingInfo.numEvents++;
            if (countingInfo.numEvents % 1000 == 0) {
                LOG.debug("Processed " + countingInfo.numEvents + " events.");
            }
        }
        // sized by the running event counter rather than the list size, so the
        // row count stays tied to countingInfo even if it was non-zero on entry
        // (any surplus rows are left null)
        Feature[][] featureMatrix = fullFeatureList.toArray(new Feature[countingInfo.numEvents][]);
        LOG.debug("Event count: " + countingInfo.numEvents);
        LOG.debug("Feature count: " + featureIndexMap.size());
        return featureMatrix;
    } catch (TalismaneException | IOException e) {
        LOG.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
Also used : TalismaneException(com.joliciel.talismane.TalismaneException) TIntArrayList(gnu.trove.list.array.TIntArrayList) ArrayList(java.util.ArrayList) WeightedOutcome(com.joliciel.talismane.utils.WeightedOutcome) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Feature(de.bwaldvogel.liblinear.Feature) TIntArrayList(gnu.trove.list.array.TIntArrayList) ArrayList(java.util.ArrayList) TIntList(gnu.trove.list.TIntList) List(java.util.List) ClassificationEvent(com.joliciel.talismane.machineLearning.ClassificationEvent) FeatureResult(com.joliciel.talismane.machineLearning.features.FeatureResult)

Example 39 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class CorpusProjectifier method onNextParseConfiguration.

/**
 * Rewrite the dependencies of a parse configuration, one pair of arcs at a
 * time, for as long as {@code getNextPair} keeps returning a pair.
 * <p>
 * For each pair, both arcs look for a replacement head by climbing from the
 * head of their current head towards the root, stopping at the first ancestor
 * for which the span between that ancestor and the arc's dependent is
 * projective with respect to the <em>other</em> arc of the pair. If both arcs
 * find a replacement, the configured strategy decides which one is actually
 * re-attached. The re-attached arc gets the non-projective suffix appended to
 * its label (unless empty or already present).
 * <p>
 * Before any change is made, the initial non-projective arcs are registered
 * as manual non-projective dependencies on the configuration.
 *
 * @param parseConfiguration
 *          the configuration to modify in place
 * @throws TalismaneException
 *           if neither arc of a pair could be given a single replacement head
 */
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration) throws TalismaneException {
    List<DependencyArc> arcs = new ArrayList<DependencyArc>(parseConfiguration.getNonProjectiveDependencies());
    NonProjectivePair pair = this.getNextPair(arcs);
    if (pair != null) {
        // set so that it stays untouched
        for (DependencyArc arc : arcs) {
            parseConfiguration.addManualNonProjectiveDependency(arc.getHead(), arc.getDependent(), arc.getLabel());
        }
    }
    while (pair != null) {
        // candidate new head for arc1: first ancestor whose span to arc1's
        // dependent is projective with respect to arc2
        PosTaggedToken newHead1 = null;
        PosTaggedToken parent1 = parseConfiguration.getHead(pair.arc1.getHead());
        int depIndex1 = pair.arc1.getDependent().getToken().getIndex();
        int depthDelta1 = 1;
        while (parent1 != null) {
            int headIndex = parent1.getToken().getIndex();
            int startIndex = Math.min(headIndex, depIndex1);
            int endIndex = Math.max(headIndex, depIndex1);
            if (isProjective(startIndex, endIndex, pair.arc2)) {
                newHead1 = parent1;
                break;
            }
            parent1 = parseConfiguration.getHead(parent1);
            depthDelta1++;
        }
        // candidate new head for arc2: symmetric search, tested against arc1
        // (testing against arc2 itself would compare the arc with its own dependent)
        PosTaggedToken newHead2 = null;
        PosTaggedToken parent2 = parseConfiguration.getHead(pair.arc2.getHead());
        int depIndex2 = pair.arc2.getDependent().getToken().getIndex();
        int depthDelta2 = 1;
        while (parent2 != null) {
            int headIndex = parent2.getToken().getIndex();
            int startIndex = Math.min(headIndex, depIndex2);
            int endIndex = Math.max(headIndex, depIndex2);
            if (isProjective(startIndex, endIndex, pair.arc1)) {
                newHead2 = parent2;
                break;
            }
            parent2 = parseConfiguration.getHead(parent2);
            depthDelta2++;
        }
        if (newHead1 != null && newHead2 != null) {
            // both arcs could be moved: gather the measures used by the
            // strategies, then keep only one of the two candidates
            int linearDistance1 = Math.abs(newHead1.getIndex() - depIndex1);
            int linearDistance2 = Math.abs(newHead2.getIndex() - depIndex2);
            // number of governors above each candidate head
            int rootDepthDelta1 = 0;
            PosTaggedToken parent = parseConfiguration.getHead(newHead1);
            while (parent != null) {
                rootDepthDelta1++;
                parent = parseConfiguration.getHead(parent);
            }
            int rootDepthDelta2 = 0;
            parent = parseConfiguration.getHead(newHead2);
            while (parent != null) {
                rootDepthDelta2++;
                parent = parseConfiguration.getHead(parent);
            }
            // each strategy falls through to the next one on a tie; the last
            // one always picks a winner
            switch(strategy) {
                case LeastLinearDistance:
                    if (linearDistance1 < linearDistance2) {
                        newHead2 = null;
                        break;
                    } else if (linearDistance2 < linearDistance1) {
                        newHead1 = null;
                        break;
                    }
                // break left out on purpose
                case LeastDepthDifference:
                    if (depthDelta1 < depthDelta2) {
                        newHead2 = null;
                        break;
                    } else if (depthDelta2 < depthDelta1) {
                        newHead1 = null;
                        break;
                    }
                // break left out on purpose
                case GreatestDepth:
                    if (rootDepthDelta1 < rootDepthDelta2) {
                        newHead1 = null;
                        break;
                    } else {
                        newHead2 = null;
                        break;
                    }
            }
        }
        if (newHead1 != null && newHead2 == null) {
            // move arc1 to its new head, marking the label with the suffix
            parseConfiguration.removeDependency(pair.arc1);
            String newLabel = pair.arc1.getLabel();
            if (this.nonProjectiveArcSuffix.length() > 0 && !newLabel.endsWith(this.nonProjectiveArcSuffix))
                newLabel += this.nonProjectiveArcSuffix;
            parseConfiguration.addDependency(newHead1, pair.arc1.getDependent(), newLabel, null);
            // for the other arc, copy the non-projective version, in case
            // there is an attempt at manual projectivisation
            DependencyArc otherProjArc = parseConfiguration.getGoverningDependency(pair.arc2.getDependent());
            parseConfiguration.removeDependency(otherProjArc);
            parseConfiguration.addDependency(pair.arc2.getHead(), pair.arc2.getDependent(), pair.arc2.getLabel(), null);
        } else if (newHead1 == null && newHead2 != null) {
            // move arc2 to its new head, marking the label with the suffix
            parseConfiguration.removeDependency(pair.arc2);
            String newLabel = pair.arc2.getLabel();
            if (this.nonProjectiveArcSuffix.length() > 0 && !newLabel.endsWith(this.nonProjectiveArcSuffix))
                newLabel += this.nonProjectiveArcSuffix;
            parseConfiguration.addDependency(newHead2, pair.arc2.getDependent(), newLabel, null);
            // for the other arc, copy the non-projective version, in case
            // there is an attempt at manual projectivisation
            DependencyArc otherProjArc = parseConfiguration.getGoverningDependency(pair.arc1.getDependent());
            parseConfiguration.removeDependency(otherProjArc);
            parseConfiguration.addDependency(pair.arc1.getHead(), pair.arc1.getDependent(), pair.arc1.getLabel(), null);
        } else {
            throw new TalismaneException("Cannot deprojectify " + pair + ". Could not find projective parents.");
        }
        // the dependencies changed: reset the configuration's memory and
        // look for the next pair among the current dependencies
        parseConfiguration.clearMemory();
        arcs = new ArrayList<DependencyArc>(parseConfiguration.getDependencies());
        pair = this.getNextPair(arcs);
    }
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) TalismaneException(com.joliciel.talismane.TalismaneException) ArrayList(java.util.ArrayList) DependencyArc(com.joliciel.talismane.parser.DependencyArc)

Example 40 with TalismaneException

use of com.joliciel.talismane.TalismaneException in project talismane by joliciel-informatique.

the class StandoffReader method hasNextSentence.

/**
 * Indicate whether a parse configuration is available, building the next one
 * from the stand-off tokens and relations when needed.
 * <p>
 * Nothing is built if a positive maximum sentence count has been reached, if
 * a configuration is already pending, or if all sentences have been consumed.
 * Otherwise the next sentence's text is assembled from its tokens (spacing
 * decided by the session's linguistic rules), the sentence is annotated, its
 * tokens are pos-tagged from the stand-off data, and dependencies are added
 * from the stand-off relations. A punctuation token without a relation is
 * attached, when a punctuation label is configured, to the nearest preceding
 * non-punctuation token.
 *
 * @return true if a parse configuration is pending
 * @throws TalismaneException
 *           if a relation refers to a head or dependent id with no matching token
 * @throws IOException
 *           declared by the signature; presumably raised by a collaborator
 */
@Override
public boolean hasNextSentence() throws TalismaneException, IOException {
    boolean limitReached = this.getMaxSentenceCount() > 0 && sentenceCount >= this.getMaxSentenceCount();
    boolean mustBuild = !limitReached && configuration == null && sentenceIndex < sentences.size();
    if (mustBuild) {
        List<StandoffToken> sentenceTokens = sentences.get(sentenceIndex++);
        LinguisticRules linguisticRules = TalismaneSession.get(sessionId).getLinguisticRules();
        if (linguisticRules == null) {
            throw new RuntimeException("Linguistic rules have not been set.");
        }

        // rebuild the raw sentence text, letting the rules decide on spacing
        String sentenceText = "";
        for (StandoffToken current : sentenceTokens) {
            String separator = linguisticRules.shouldAddSpace(sentenceText, current.text) ? " " : "";
            sentenceText = sentenceText + separator + current.text;
        }
        Sentence sentence = new Sentence(sentenceText, sessionId);
        for (SentenceAnnotator sentenceAnnotator : TalismaneSession.get(sessionId).getSentenceAnnotators()) {
            sentenceAnnotator.annotate(sentence);
        }

        // tokenise and pos-tag from the stand-off data, remembering which
        // pos-tagged token belongs to which stand-off id
        PretokenisedSequence tokenSequence = new PretokenisedSequence(sentence, sessionId);
        PosTagSequence posTagSequence = new PosTagSequence(tokenSequence);
        Map<String, PosTaggedToken> tokensById = new HashMap<>();
        for (StandoffToken current : sentenceTokens) {
            Token token = tokenSequence.addToken(current.text);
            Decision tagDecision = new Decision(current.posTag.getCode());
            PosTaggedToken taggedToken = new PosTaggedToken(token, tagDecision, sessionId);
            if (LOG.isTraceEnabled()) {
                LOG.trace(taggedToken.toString());
            }
            taggedToken.setComment(current.comment);
            posTagSequence.addPosTaggedToken(taggedToken);
            tokensById.put(current.id, taggedToken);
            LOG.debug("Found token " + current.id + ", " + taggedToken);
        }
        tokenSequence.setWithRoot(true);
        configuration = new ParseConfiguration(posTagSequence);

        // add the dependencies
        for (StandoffToken current : sentenceTokens) {
            StandoffRelation relation = relationMap.get(current.id);
            if (relation != null) {
                PosTaggedToken governor = tokensById.get(relation.fromToken);
                PosTaggedToken governed = tokensById.get(relation.toToken);
                if (governor == null) {
                    throw new TalismaneException("No token found for head id: " + relation.fromToken);
                }
                if (governed == null) {
                    throw new TalismaneException("No token found for dependent id: " + relation.toToken);
                }
                DependencyArc arc = configuration.addDependency(governor, governed, relation.label, null);
                arc.setComment(relation.comment);
            } else if (current.posTag.getOpenClassIndicator() == PosTagOpenClassIndicator.PUNCTUATION && punctuationDepLabel != null) {
                // unattached punctuation: walk backwards to the closest
                // token which is not itself punctuation
                PosTaggedToken punctuation = tokensById.get(current.id);
                for (int i = punctuation.getIndex() - 1; i >= 0; i--) {
                    PosTaggedToken candidate = posTagSequence.get(i);
                    if (candidate.getTag().getOpenClassIndicator() != PosTagOpenClassIndicator.PUNCTUATION) {
                        configuration.addDependency(candidate, punctuation, punctuationDepLabel, null);
                        break;
                    }
                }
            }
        }
    }
    return configuration != null;
}
Also used : PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) TalismaneException(com.joliciel.talismane.TalismaneException) PosTaggedToken(com.joliciel.talismane.posTagger.PosTaggedToken) Token(com.joliciel.talismane.tokeniser.Token) Decision(com.joliciel.talismane.machineLearning.Decision) ParseConfiguration(com.joliciel.talismane.parser.ParseConfiguration) PretokenisedSequence(com.joliciel.talismane.tokeniser.PretokenisedSequence) LinguisticRules(com.joliciel.talismane.LinguisticRules) SentenceAnnotator(com.joliciel.talismane.sentenceAnnotators.SentenceAnnotator) PosTagSequence(com.joliciel.talismane.posTagger.PosTagSequence) DependencyArc(com.joliciel.talismane.parser.DependencyArc) Sentence(com.joliciel.talismane.rawText.Sentence)

Aggregations

TalismaneException (com.joliciel.talismane.TalismaneException)47 ArrayList (java.util.ArrayList)27 Config (com.typesafe.config.Config)14 File (java.io.File)11 List (java.util.List)10 TreeSet (java.util.TreeSet)10 FeatureResult (com.joliciel.talismane.machineLearning.features.FeatureResult)9 IOException (java.io.IOException)9 HashMap (java.util.HashMap)9 Set (java.util.Set)9 Decision (com.joliciel.talismane.machineLearning.Decision)8 RuntimeEnvironment (com.joliciel.talismane.machineLearning.features.RuntimeEnvironment)8 PosTaggedToken (com.joliciel.talismane.posTagger.PosTaggedToken)8 Token (com.joliciel.talismane.tokeniser.Token)8 Map (java.util.Map)8 SortedSet (java.util.SortedSet)8 Collectors (java.util.stream.Collectors)8 Logger (org.slf4j.Logger)8 LoggerFactory (org.slf4j.LoggerFactory)8 Sentence (com.joliciel.talismane.rawText.Sentence)7