Search in sources :

Example 16 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

the class MentionDetectionClassifier method extractFeatures.

/**
 * Builds the indicator-feature counter used to score a single candidate mention.
 *
 * <p>Every feature is added with a count of 1. The features cover the NER label of the
 * mention's head word, whether the mention's span string is a known named-entity string,
 * the words/POS tags immediately around the mention, the first/last tokens of the mention,
 * and whether the mention contains or is contained in another mention from {@code shares}.
 *
 * @param p the mention to featurize
 * @param shares mentions that {@code p} is compared against for span containment
 *               ({@code p} itself is skipped by identity)
 * @param neStrings span strings treated as named-entity strings; looked up with
 *                  {@code p.lowercaseNormalizedSpanString()}
 * @param dict not read by this method
 * @param props not read by this method
 * @return a fresh counter holding the extracted features
 */
public static Counter<String> extractFeatures(Mention p, Set<Mention> shares, Set<String> neStrings, Dictionaries dict, Properties props) {
    final Counter<String> feats = new ClassicCounter<>();
    final String spanText = p.lowercaseNormalizedSpanString();
    final String headNer = p.headWord.ner();
    final int start = p.startIndex;
    final int end = p.endIndex;
    final List<CoreLabel> sentence = p.sentenceWords;

    // Tokens adjacent to the mention; null when the mention touches a sentence boundary.
    CoreLabel before = null;
    if (start != 0) {
        before = sentence.get(start - 1);
    }
    CoreLabel after = null;
    if (end != sentence.size()) {
        after = sentence.get(end);
    }
    final CoreLabel first = p.originalSpan.get(0);
    final CoreLabel last = p.originalSpan.get(p.originalSpan.size() - 1);

    // Named-entity features.
    feats.incrementCount("B-NETYPE-" + headNer);
    if (neStrings.contains(spanText)) {
        feats.incrementCount("B-NE-STRING-EXIST");
        // "Full span": neither neighbouring token carries the same NER label as the head.
        final boolean sameTypeBefore = before != null && before.ner().equals(headNer);
        if (!sameTypeBefore) {
            final boolean sameTypeAfter = after != null && after.ner().equals(headNer);
            if (!sameTypeAfter) {
                feats.incrementCount("B-NE-FULLSPAN");
            }
        }
    }

    // Context features (word first, then POS, for each available neighbour).
    if (before != null) {
        feats.incrementCount("B-PRECEDINGWORD-" + before.word());
    }
    if (after != null) {
        feats.incrementCount("B-FOLLOWINGWORD-" + after.word());
    }
    if (before != null) {
        feats.incrementCount("B-PRECEDINGPOS-" + before.tag());
    }
    if (after != null) {
        feats.incrementCount("B-FOLLOWINGPOS-" + after.tag());
    }

    // Boundary-token features.
    feats.incrementCount("B-FIRSTWORD-" + first.word());
    feats.incrementCount("B-FIRSTPOS-" + first.tag());
    feats.incrementCount("B-LASTWORD-" + last.word());
    // NOTE(review): the POS tag of the last word shares the "B-LASTWORD-" prefix with the
    // word feature above; this looks like a typo for a POS-specific prefix, but the key is
    // kept as-is because previously trained models may depend on it -- confirm before changing.
    feats.incrementCount("B-LASTWORD-" + last.tag());

    // Containment relative to the other candidate mentions.
    final boolean containsOther = shares.stream().anyMatch(other -> other != p && other.insideIn(p));
    if (containsOther) {
        feats.incrementCount("B-BIGGER-THAN-ANOTHER");
    }
    final boolean containedInOther = shares.stream().anyMatch(other -> other != p && p.insideIn(other));
    if (containedInOther) {
        feats.incrementCount("B-SMALLER-THAN-ANOTHER");
    }
    return feats;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) Mention(edu.stanford.nlp.coref.data.Mention) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter)

Example 17 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

the class MentionDetectionClassifier method classifyMentions.

/**
 * Prunes competing mention candidates in place.
 *
 * <p>First collects the lowercase-normalized span strings of mentions that look like
 * complete named entities (head NER label is not {@code "O"} and every token of the span
 * carries that same label). Then, within each inner list, mentions are grouped by head
 * index; for every group with more than one mention the mention with the highest
 * {@code probabilityOf} score is kept (ties broken by comparing {@code spanToString()})
 * and the others are removed from the inner list.
 *
 * @param predictedMentions candidate mentions, one inner list per group; the inner lists
 *                          are modified
 * @param dict passed through to {@code probabilityOf}
 * @param props passed through to {@code probabilityOf}
 */
public void classifyMentions(List<List<Mention>> predictedMentions, Dictionaries dict, Properties props) {
    Set<String> neStrings = Generics.newHashSet();
    for (List<Mention> predictedMention : predictedMentions) {
        for (Mention m : predictedMention) {
            String ne = m.headWord.ner();
            if (ne.equals("O"))
                continue;
            // Only spans whose every token carries the head's NER label count as NE strings.
            // The check has to skip the *mention*: a bare `continue` inside a per-token loop
            // only advances that inner loop, which made the filter a no-op and let every
            // mention with a non-"O" head through.
            // NOTE(review): this narrows neStrings and therefore the "B-NE-STRING-EXIST"
            // feature; re-evaluate any model trained against the unfiltered behaviour.
            boolean uniformNer = m.originalSpan.stream().allMatch(cl -> ne.equals(cl.ner()));
            if (!uniformNer)
                continue;
            neStrings.add(m.lowercaseNormalizedSpanString());
        }
    }
    for (List<Mention> predicts : predictedMentions) {
        // Group the candidates by the index of their head word.
        Map<Integer, Set<Mention>> headPositions = Generics.newHashMap();
        for (Mention p : predicts) {
            headPositions.computeIfAbsent(p.headIndex, k -> Generics.newHashSet()).add(p);
        }
        Set<Mention> remove = Generics.newHashSet();
        for (Set<Mention> shares : headPositions.values()) {
            if (shares.size() > 1) {
                Counter<Mention> probs = new ClassicCounter<>();
                for (Mention p : shares) {
                    double trueProb = probabilityOf(p, shares, neStrings, dict, props);
                    probs.incrementCount(p, trueProb);
                }
                // Keep the best-scoring mention; everything else in the group is dropped.
                Mention keep = Counters.argmax(probs, (m1, m2) -> m1.spanToString().compareTo(m2.spanToString()));
                probs.remove(keep);
                remove.addAll(probs.keySet());
            }
        }
        for (Mention r : remove) {
            predicts.remove(r);
        }
    }
}
Also used : Set(java.util.Set) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Mention(edu.stanford.nlp.coref.data.Mention) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter)

Example 18 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

the class NeuralCorefAlgorithm method runCoref.

/**
 * Resolves coreference for {@code document} with the neural mention-ranking model.
 *
 * <p>Each mention gets an antecedent embedding, an anaphor embedding and an anaphoricity
 * score. For every mention returned by {@code CorefUtils.heuristicFilter} the candidate
 * antecedent with the highest pairwise score is chosen, provided that score beats the
 * mention's anaphoricity score shifted by {@code 50 * (greedyness - 0.5)}; the chosen
 * pair is then merged via {@code CorefUtils.mergeCoreferenceClusters}.
 *
 * @param document the document whose coreference clusters are updated
 */
@Override
public void runCoref(Document document) {
    final List<Mention> mentions = CorefUtils.getSortedMentions(document);

    // Index mentions by head position; the feature extractor consumes this map.
    final Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
    for (Mention mention : mentions) {
        mentionsByHeadIndex.computeIfAbsent(mention.headIndex, k -> new ArrayList<>()).add(mention);
    }

    // Per-mention representations and "starts a new entity" scores, keyed by mention ID.
    final SimpleMatrix docEmbedding = embeddingExtractor.getDocumentEmbedding(document);
    final Map<Integer, SimpleMatrix> antecedentReps = new HashMap<>();
    final Map<Integer, SimpleMatrix> anaphorReps = new HashMap<>();
    final Counter<Integer> anaphoricity = new ClassicCounter<>();
    for (Mention mention : mentions) {
        final SimpleMatrix embedding = embeddingExtractor.getMentionEmbeddings(mention, docEmbedding);
        antecedentReps.put(mention.mentionID, model.getAntecedentEmbedding(embedding));
        anaphorReps.put(mention.mentionID, model.getAnaphorEmbedding(embedding));
        anaphoricity.incrementCount(mention.mentionID,
                model.getAnaphoricityScore(embedding, featureExtractor.getAnaphoricityFeatures(mention, document, mentionsByHeadIndex)));
    }

    final Map<Integer, List<Integer>> candidatesByMention =
            CorefUtils.heuristicFilter(mentions, maxMentionDistance, maxMentionDistanceWithStringMatch);
    for (Map.Entry<Integer, List<Integer>> entry : candidatesByMention.entrySet()) {
        // Score to beat: the mention's own anaphoricity score, offset by the greedyness setting.
        double best = anaphoricity.getCount(entry.getKey()) - 50 * (greedyness - 0.5);
        final int anaphor = entry.getKey();
        Integer chosen = null;
        for (int candidate : entry.getValue()) {
            final double pairScore = model.getPairwiseScore(
                    antecedentReps.get(candidate),
                    anaphorReps.get(anaphor),
                    featureExtractor.getPairFeatures(new Pair<>(candidate, anaphor), document, mentionsByHeadIndex));
            if (pairScore > best) {
                best = pairScore;
                chosen = candidate;
            }
        }
        if (chosen == null) {
            // No candidate beat the threshold: leave the mention unlinked.
            continue;
        }
        CorefUtils.mergeCoreferenceClusters(new Pair<>(chosen, anaphor), document);
    }
}
Also used : HashMap(java.util.HashMap) SimpleMatrix(org.ejml.simple.SimpleMatrix) Mention(edu.stanford.nlp.coref.data.Mention) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair)

Example 19 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

the class FeatureExtractor method getFeatures.

/**
 * Builds the feature counter for a pair of mentions.
 *
 * <p>{@code m1} is expected to appear earlier than {@code m2} (asserted below); the feature
 * names treat {@code m1} as the candidate antecedent and {@code m2} as the anaphor.
 * Indicator features are added with a count of 1 via {@code incrementCount(name)};
 * real-valued features use {@code incrementCount(name, value)}. {@code addFeature} and
 * {@code addNumeric} are helpers defined elsewhere in the class.
 *
 * <p>The instance flags {@code useDocSource} and {@code useConstituencyParse} gate the
 * document-source and syntax feature groups respectively.
 *
 * @param doc the document containing both mentions
 * @param m1 the earlier mention
 * @param m2 the later mention
 * @return a fresh counter holding the pair features
 */
private Counter<String> getFeatures(Document doc, Mention m1, Mention m2) {
    // Only checked when the JVM runs with assertions enabled (-ea).
    assert (m1.appearEarlierThan(m2));
    Counter<String> features = new ClassicCounter<>();
    // global features
    features.incrementCount("bias");
    if (useDocSource) {
        features.incrementCount("doc-type=" + doc.docType);
        if (doc.docInfo != null && doc.docInfo.containsKey("DOC_ID")) {
            // Uses the second "/"-separated component of DOC_ID as the source.
            // NOTE(review): assumes DOC_ID contains at least one "/"; otherwise the [1]
            // index is out of bounds.
            features.incrementCount("doc-source=" + doc.docInfo.get("DOC_ID").split("/")[1]);
        }
    }
    // singleton feature conjunctions: for each configured index present in both lists,
    // emit "<name>=<value1>_<value2>"
    List<String> singletonFeatures1 = m1.getSingletonFeatures(dictionaries);
    List<String> singletonFeatures2 = m2.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures1.size() && e.getKey() < singletonFeatures2.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures1.get(e.getKey()) + "_" + singletonFeatures2.get(e.getKey()));
        }
    }
    // dependency-parent relations, roles and head word/POS conjunctions
    SemanticGraphEdge p1 = getDependencyParent(m1);
    SemanticGraphEdge p2 = getDependencyParent(m2);
    features.incrementCount("dep-relations=" + (p1 == null ? "null" : p1.getRelation()) + "_" + (p2 == null ? "null" : p2.getRelation()));
    features.incrementCount("roles=" + getRole(m1) + "_" + getRole(m2));
    CoreLabel headCL1 = headWord(m1);
    CoreLabel headCL2 = headWord(m2);
    String headPOS1 = getPOS(headCL1);
    String headPOS2 = getPOS(headCL2);
    features.incrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
    features.incrementCount("head-words=" + wordIndicator("h_" + headCL1.word().toLowerCase() + "_" + headCL2.word().toLowerCase(), headPOS1 + "_" + headPOS2));
    // agreement features
    addFeature(features, "animacies-agree", m2.animaciesAgree(m1));
    addFeature(features, "attributes-agree", m2.attributesAgree(m1, dictionaries));
    addFeature(features, "entity-types-agree", m2.entityTypesAgree(m1, dictionaries));
    addFeature(features, "numbers-agree", m2.numbersAgree(m1));
    addFeature(features, "genders-agree", m2.gendersAgree(m1));
    addFeature(features, "ner-strings-equal", m1.nerString.equals(m2.nerString));
    // string matching features
    addFeature(features, "antecedent-head-in-anaphor", headContainedIn(m1, m2));
    addFeature(features, "anaphor-head-in-antecedent", headContainedIn(m2, m1));
    // The remaining string-match features are skipped when either mention is pronominal.
    if (m1.mentionType != MentionType.PRONOMINAL && m2.mentionType != MentionType.PRONOMINAL) {
        addFeature(features, "antecedent-in-anaphor", m2.spanToString().toLowerCase().contains(m1.spanToString().toLowerCase()));
        addFeature(features, "anaphor-in-antecedent", m1.spanToString().toLowerCase().contains(m2.spanToString().toLowerCase()));
        addFeature(features, "heads-equal", m1.headString.equalsIgnoreCase(m2.headString));
        addFeature(features, "heads-agree", m2.headsAgree(m1));
        addFeature(features, "exact-match", m1.toString().trim().toLowerCase().equals(m2.toString().trim().toLowerCase()));
        addFeature(features, "partial-match", relaxedStringMatch(m1, m2));
        // Edit distance normalized by the combined length of the two strings.
        // NOTE(review): if both strings are empty the divisor is 0 -- confirm that cannot happen.
        double editDistance = StringUtils.editDistance(m1.spanToString(), m2.spanToString()) / (double) (m1.spanToString().length() + m2.spanToString().length());
        features.incrementCount("edit-distance", editDistance);
        // Bucketed variant: value truncated to one decimal place.
        features.incrementCount("edit-distance=" + ((int) (editDistance * 10) / 10.0));
        double headEditDistance = StringUtils.editDistance(m1.headString, m2.headString) / (double) (m1.headString.length() + m2.headString.length());
        features.incrementCount("head-edit-distance", headEditDistance);
        features.incrementCount("head-edit-distance=" + ((int) (headEditDistance * 10) / 10.0));
    }
    // distance features
    addNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
    addNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
    if (m2.sentNum == m1.sentNum) {
        addNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
        if (m1.endIndex > m2.startIndex) {
            features.incrementCount("spans-intersect");
        }
    }
    // setup for dcoref features: wrap each mention in a single-mention cluster so the
    // cluster-level CorefRules checks below can be called. Cluster IDs are random values
    // drawn from [20000, 30000) for c1 and [10000, 20000) for c2.
    Set<Mention> ms1 = new HashSet<>();
    ms1.add(m1);
    Set<Mention> ms2 = new HashSet<>();
    ms2.add(m2);
    Random r = new Random();
    CorefCluster c1 = new CorefCluster(20000 + r.nextInt(10000), ms1);
    CorefCluster c2 = new CorefCluster(10000 + r.nextInt(10000), ms2);
    String s2 = m2.lowercaseNormalizedSpanString();
    String s1 = m1.lowercaseNormalizedSpanString();
    // discourse dcoref features
    // NOTE(review): the speaker/utterance annotations on headWord are dereferenced without
    // a null check in this section -- presumably always set upstream; confirm.
    addFeature(features, "mention-speaker-PER0", m2.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0"));
    addFeature(features, "antecedent-is-anaphor-speaker", CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "same-speaker", CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "person-disagree-same-speaker", CorefRules.entityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.antecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
    addFeature(features, "discourse-you-PER0", m2.person == Person.YOU && doc.docType == DocType.ARTICLE && m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "speaker-match-i-i", m2.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s1) && m1.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s2) && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "speaker-match-speaker-i", m2.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s2) && CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "speaker-match-i-speaker", m1.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s1) && CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
    addFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.contains(s1) && dictionaries.secondPersonPronouns.contains(s2) && CorefRules.entitySameSpeaker(doc, m2, m1));
    // I/YOU (in either order) in consecutive utterances (m2's utterance index is exactly
    // one greater than m1's) of a conversation.
    addFeature(features, "discourse-between-two-person", ((m2.person == Person.I && m1.person == Person.YOU || (m2.person == Person.YOU && m1.person == Person.I)) && (m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && doc.docType == DocType.CONVERSATION));
    addFeature(features, "incompatible-not-match", m1.person != Person.I && m2.person != Person.I && (CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));
    int utteranceDist = Math.abs(m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class));
    // Speaker-mismatch features are only emitted for non-article documents, adjacent
    // utterances and different speakers.
    if (doc.docType != DocType.ARTICLE && utteranceDist == 1 && !CorefRules.entitySameSpeaker(doc, m2, m1)) {
        addFeature(features, "speaker-mismatch-i-i", m1.person == Person.I && m2.person == Person.I);
        addFeature(features, "speaker-mismatch-you-you", m1.person == Person.YOU && m2.person == Person.YOU);
        addFeature(features, "speaker-mismatch-we-we", m1.person == Person.WE && m2.person == Person.WE);
    }
    // other dcoref features
    String firstWord1 = firstWord(m1).word().toLowerCase();
    addFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.equals("a") || firstWord1.equals("an"))));
    addFeature(features, "far-this", m2.lowercaseNormalizedSpanString().equals("this") && Math.abs(m2.sentNum - m1.sentNum) > 3);
    // NOTE(review): same condition as "discourse-you-PER0" above, emitted under a second name.
    addFeature(features, "per0-you-in-article", m2.person == Person.YOU && doc.docType == DocType.ARTICLE && m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "inside-in", m2.insideIn(m1) || m1.insideIn(m2));
    addFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.contains(m1.originalSpan.get(0).lemma()) || dictionaries.indefinitePronouns.contains(m2.originalSpan.get(0).lemma()));
    addFeature(features, "entity-attributes-agree", CorefRules.entityAttributesAgree(c2, c1));
    addFeature(features, "entity-token-distance", CorefRules.entityTokenDistance(m2, m1));
    addFeature(features, "i-within-i", CorefRules.entityIWithinI(m2, m1, dictionaries));
    addFeature(features, "exact-string-match", CorefRules.entityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
    addFeature(features, "entity-relaxed-heads-agree", CorefRules.entityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
    addFeature(features, "is-acronym", CorefRules.entityIsAcronym(doc, c2, c1));
    addFeature(features, "demonym", m2.isDemonym(m1, dictionaries));
    addFeature(features, "incompatible-modifier", CorefRules.entityHaveIncompatibleModifier(m2, m1));
    addFeature(features, "head-lemma-match", m1.headWord.lemma().equals(m2.headWord.lemma()));
    addFeature(features, "words-included", CorefRules.entityWordsIncluded(c2, c1, m2, m1));
    addFeature(features, "extra-proper-noun", CorefRules.entityHaveExtraProperNoun(m2, m1, new HashSet<>()));
    addFeature(features, "number-in-later-mentions", CorefRules.entityNumberInLaterMention(m2, m1));
    addFeature(features, "sentence-context-incompatible", CorefRules.sentenceContextIncompatible(m2, m1, dictionaries));
    // syntax features
    if (useConstituencyParse) {
        if (m1.sentNum == m2.sentNum) {
            // Walk up from m2's subtree, counting ancestors whose label starts with "S",
            // until an ancestor dominates m1's subtree or the top of the tree is reached.
            // NOTE(review): assumes m2.mentionSubTree has a parent within contextParseTree
            // on the first step -- confirm ancestor(1, tree) cannot return null there.
            int clauseCount = 0;
            Tree tree = m2.contextParseTree;
            Tree current = m2.mentionSubTree;
            while (true) {
                current = current.ancestor(1, tree);
                if (current.label().value().startsWith("S")) {
                    clauseCount++;
                }
                if (current.dominates(m1.mentionSubTree)) {
                    break;
                }
                if (current.label().value().equals("ROOT") || current.ancestor(1, tree) == null) {
                    break;
                }
            }
            features.incrementCount("clause-count", clauseCount);
            features.incrementCount("clause-count=" + bin(clauseCount));
        }
        if (RuleBasedCorefMentionFinder.isPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.isPleonastic(m1, m1.contextParseTree)) {
            features.incrementCount("pleonastic-it");
        }
        // Reference comparison: both mentions must resolve to the very same maximal-NP node.
        if (maximalNp(m1.mentionSubTree) == maximalNp(m2.mentionSubTree)) {
            features.incrementCount("same-maximal-np");
        }
        boolean m1Embedded = headEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
        boolean m2Embedded = headEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
        features.incrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
    }
    return features;
}
Also used : SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Random(java.util.Random) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet) SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation)

Example 20 with ClassicCounter

use of edu.stanford.nlp.stats.ClassicCounter in project CoreNLP by stanfordnlp.

the class FeatureExtractor method getFeatures.

/**
 * Builds the feature counter for a single mention.
 *
 * <p>Indicator features are added with a count of 1 via {@code incrementCount(name)};
 * real-valued features use {@code incrementCount(name, value)}. {@code addFeature},
 * {@code addNumeric}, {@code bin} and {@code wordIndicator} are helpers defined elsewhere
 * in the class. The syntax-tree features are only produced when
 * {@code useConstituencyParse} is set.
 *
 * @param doc the document containing the mention
 * @param m the mention to featurize
 * @param mentionsByHeadIndex mentions grouped by head index; used for the containment
 *                            features and looked up with {@code m.headIndex}
 * @return a fresh counter holding the mention features
 */
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
    Counter<String> features = new ClassicCounter<>();
    // type features
    features.incrementCount("mention-type=" + m.mentionType);
    features.incrementCount("gender=" + m.gender);
    features.incrementCount("person-fine=" + m.person);
    features.incrementCount("head-ne-type=" + m.nerString);
    // one "<name>=<value>" feature per configured singleton-feature index that exists in the list
    List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
        }
    }
    // length and location features
    addNumeric(features, "mention-length", m.spanToString().length());
    addNumeric(features, "mention-words", m.originalSpan.size());
    addNumeric(features, "sentence-words", m.sentenceWords.size());
    features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
    // Positions expressed as fractions of the document's mention / sentence counts.
    features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
    features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
    // lexical features
    CoreLabel firstWord = firstWord(m);
    CoreLabel lastWord = lastWord(m);
    CoreLabel headWord = headWord(m);
    CoreLabel prevWord = prevWord(m);
    CoreLabel nextWord = nextWord(m);
    CoreLabel prevprevWord = prevprevWord(m);
    CoreLabel nextnextWord = nextnextWord(m);
    String headPOS = getPOS(headWord);
    String firstPOS = getPOS(firstWord);
    String lastPOS = getPOS(lastWord);
    String prevPOS = getPOS(prevWord);
    String nextPOS = getPOS(nextWord);
    String prevprevPOS = getPOS(prevprevWord);
    String nextnextPOS = getPOS(nextnextWord);
    features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
    features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
    features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
    features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
    features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
    features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.incrementCount("next-pos=" + nextPOS);
    features.incrementCount("prev-pos=" + prevPOS);
    features.incrementCount("first-pos=" + firstPOS);
    features.incrementCount("last-pos=" + lastPOS);
    features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    addDependencyFeatures(features, "parent", getDependencyParent(m), true);
    addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    // Single-token mention whose first (and only) token is tagged NNS.
    addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
    // syntax features
    // Follow dependency-parent edges upward from the head word for at most three steps,
    // emitting one "dep-path=" feature per prefix of the relation chain.
    IndexedWord w = m.headIndexedWord;
    String depPath = "";
    int depth = 0;
    while (w != null) {
        SemanticGraphEdge e = getDependencyParent(m, w);
        depth++;
        if (depth <= 3 && e != null) {
            depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
            features.incrementCount("dep-path=" + depPath);
            w = e.getSource();
        } else {
            w = null;
        }
    }
    if (useConstituencyParse) {
        // headEmbeddingLevel returning -1 is treated as "could not be determined".
        int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
        int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
        if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
            features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
            features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
        } else {
            features.incrementCount("undetermined-embedding-level");
        }
        features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
        // Path from the parent of the head leaf toward the tree root: one "syntax-path="
        // feature per prefix, stopping after four nodes or at the first node valued "S".
        String syntaxPath = "";
        Tree tree = m.contextParseTree;
        Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
        depth = 0;
        for (Tree node : tree.pathNodeToNode(head, tree)) {
            syntaxPath += node.value() + "-";
            features.incrementCount("syntax-path=" + syntaxPath);
            depth++;
            if (depth >= 4 || node.value().equals("S")) {
                break;
            }
        }
    }
    // mention containment features
    // NOTE(review): assumes mentionsByHeadIndex has an entry for m.headIndex; a missing
    // key would make the .stream() call fail on null -- confirm against callers.
    addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
    addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
    // features from dcoref rules
    // Like "is-generic" above, but tests the head word's POS rather than the first word's.
    addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
    addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
    addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
    addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
    addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
    // "you" immediately followed by "know" outside of articles.
    if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
        features.incrementCount("generic-you");
    }
    return features;
}
Also used : SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation) Tree(edu.stanford.nlp.trees.Tree) HashMap(java.util.HashMap) Random(java.util.Random) Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Number(edu.stanford.nlp.coref.data.Dictionaries.Number) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) RuleBasedCorefMentionFinder(edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder) Counter(edu.stanford.nlp.stats.Counter) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CorefRules(edu.stanford.nlp.coref.CorefRules) IndexedWord(edu.stanford.nlp.ling.IndexedWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Properties(java.util.Properties) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) Iterator(java.util.Iterator) IOUtils(edu.stanford.nlp.io.IOUtils) DocType(edu.stanford.nlp.coref.data.Document.DocType) Set(java.util.Set) Person(edu.stanford.nlp.coref.data.Dictionaries.Person) List(java.util.List) MentionType(edu.stanford.nlp.coref.data.Dictionaries.MentionType) StringUtils(edu.stanford.nlp.util.StringUtils) CorefProperties(edu.stanford.nlp.coref.CorefProperties) Document(edu.stanford.nlp.coref.data.Document) CorefUtils(edu.stanford.nlp.coref.CorefUtils) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) IndexedWord(edu.stanford.nlp.ling.IndexedWord) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 69; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 27; ArrayList (java.util.ArrayList): 21; CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 18; Tree (edu.stanford.nlp.trees.Tree): 13; Pair (edu.stanford.nlp.util.Pair): 11; Counter (edu.stanford.nlp.stats.Counter): 10; List (java.util.List): 10; Mention (edu.stanford.nlp.coref.data.Mention): 8; Language (edu.stanford.nlp.international.Language): 7; RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException): 7; CoreMap (edu.stanford.nlp.util.CoreMap): 7; IOUtils (edu.stanford.nlp.io.IOUtils): 6; Label (edu.stanford.nlp.ling.Label): 6; TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 6; PrintWriter (java.io.PrintWriter): 6; java.util (java.util): 6; HashSet (java.util.HashSet): 6; RVFDatum (edu.stanford.nlp.ling.RVFDatum): 5; DiskTreebank (edu.stanford.nlp.trees.DiskTreebank): 5