Search in sources:

Example 1 with CorefCluster

use of edu.stanford.nlp.coref.data.CorefCluster in project CoreNLP by stanfordnlp.

From the class CorefSystem, method annotate.

/**
 * Runs coreference resolution over the given annotation and attaches the
 * resulting coref chains to it under {@code CorefChainAnnotation}.
 *
 * @param ann the annotation to process; updated in place
 * @throws RuntimeException if the document cannot be constructed from the
 *         annotation (the underlying cause is preserved)
 */
public void annotate(Annotation ann) {
    final Document doc;
    try {
        doc = docMaker.makeDocument(ann);
    } catch (Exception e) {
        throw new RuntimeException("Error making document", e);
    }
    CorefUtils.checkForInterrupt();
    corefAlgorithm.runCoref(doc);
    if (removeSingletonClusters) {
        CorefUtils.removeSingletonClusters(doc);
    }
    CorefUtils.checkForInterrupt();
    // Convert each surviving cluster into a CorefChain keyed by its cluster id.
    Map<Integer, CorefChain> chains = Generics.newHashMap();
    for (CorefCluster cluster : doc.corefClusters.values()) {
        chains.put(cluster.clusterID, new CorefChain(cluster, doc.positions));
    }
    ann.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
}
Also used : CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) CorefChain(edu.stanford.nlp.coref.data.CorefChain) Document(edu.stanford.nlp.coref.data.Document) IOException(java.io.IOException)

Example 2 with CorefCluster

use of edu.stanford.nlp.coref.data.CorefCluster in project CoreNLP by stanfordnlp.

From the class CorefSystem, method runOnConll.

/**
 * Runs the coreference system over a CoNLL corpus, writing three CoNLL files
 * (gold, mentions-before-coref, and final coref predictions) plus a log, and
 * then scores the before/after predictions against the gold file.
 *
 * <p>Fixes over the previous version: the three {@link PrintWriter}s are now
 * managed by try-with-resources so they are closed on any exception, and —
 * critically — they are closed (and therefore flushed) <em>before</em> the
 * scorer reads the output files. Previously the scorer could read partially
 * flushed files because the writers were only closed after scoring.
 *
 * @param props properties controlling output paths and the scorer location
 * @throws Exception if document processing or scoring fails
 */
public void runOnConll(Properties props) throws Exception {
    File outputDir = new File(CorefProperties.conllOutputPath(props));
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }
    // Timestamp with whitespace/colons replaced so it is safe in file names.
    String timestamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
    String baseName = CorefProperties.conllOutputPath(props) + timestamp;
    String goldOutput = baseName + ".gold.txt";
    String beforeCorefOutput = baseName + ".predicted.txt";
    String afterCorefOutput = baseName + ".coref.predicted.txt";
    Logger logger = Logger.getLogger(CorefSystem.class.getName());
    initLogger(logger, baseName + ".log");
    logger.info(timestamp);
    logger.info(props.toString());
    // try-with-resources guarantees the writers are closed (and their buffers
    // flushed) even if processing throws, and before any scoring below.
    try (PrintWriter writerGold = new PrintWriter(new FileOutputStream(goldOutput));
         PrintWriter writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
         PrintWriter writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput))) {
        (new CorefDocumentProcessor() {

            @Override
            public void process(int id, Document document) {
                writerGold.print(CorefPrinter.printConllOutput(document, true));
                writerBeforeCoref.print(CorefPrinter.printConllOutput(document, false));
                long time = System.currentTimeMillis();
                corefAlgorithm.runCoref(document);
                if (verbose) {
                    Redwood.log(getName(), "Coref took " + (System.currentTimeMillis() - time) / 1000.0 + "s");
                }
                CorefUtils.removeSingletonClusters(document);
                if (verbose) {
                    CorefUtils.printHumanReadableCoref(document);
                }
                if (document.filterMentionSet != null) {
                    // Restrict output to clusters whose mention spans survive the filter.
                    Map<Integer, CorefCluster> filteredClusters = document.corefClusters.values().stream().filter(x -> CorefUtils.filterClustersWithMentionSpans(x, document.filterMentionSet)).collect(Collectors.toMap(x -> x.clusterID, x -> x));
                    writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true, filteredClusters));
                } else {
                    writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true));
                }
            }

            @Override
            public void finish() throws Exception {
            }

            @Override
            public String getName() {
                return corefAlgorithm.getClass().getName();
            }
        }).run(docMaker);
    }
    // The writers are now closed, so the scorer sees complete, flushed files.
    String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
    logger.info("Before Coref");
    CorefScorer.printScoreSummary(summary, logger, false);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary, logger);
    summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
    logger.info("After Coref");
    CorefScorer.printScoreSummary(summary, logger, false);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary, logger);
}
Also used : PrintWriter(java.io.PrintWriter) NewlineLogFormatter(edu.stanford.nlp.util.logging.NewlineLogFormatter) Properties(java.util.Properties) CorefChain(edu.stanford.nlp.coref.data.CorefChain) Redwood(edu.stanford.nlp.util.logging.Redwood) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) DocumentMaker(edu.stanford.nlp.coref.data.DocumentMaker) Logger(java.util.logging.Logger) Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) Collectors(java.util.stream.Collectors) File(java.io.File) Level(java.util.logging.Level) FileHandler(java.util.logging.FileHandler) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Calendar(java.util.Calendar) Annotation(edu.stanford.nlp.pipeline.Annotation) StringUtils(edu.stanford.nlp.util.StringUtils) Map(java.util.Map) Document(edu.stanford.nlp.coref.data.Document) Generics(edu.stanford.nlp.util.Generics) Logger(java.util.logging.Logger) Document(edu.stanford.nlp.coref.data.Document) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) FileOutputStream(java.io.FileOutputStream) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 3 with CorefCluster

use of edu.stanford.nlp.coref.data.CorefCluster in project CoreNLP by stanfordnlp.

From the class FeatureExtractor, method getFeatures.

/**
 * Builds the pairwise feature vector used to decide whether the later mention
 * {@code m2} (anaphor) corefers with the earlier mention {@code m1}
 * (antecedent). Feature groups, in order: global/document features,
 * singleton-feature conjunctions, dependency/POS/head-word features,
 * agreement features, string-matching features, distance features, dcoref
 * discourse features, other dcoref rule features, and (when a constituency
 * parse is available) syntax features.
 *
 * @param doc the document containing both mentions
 * @param m1 the antecedent mention; asserted to appear before m2
 * @param m2 the anaphor mention
 * @return a counter from feature name to value (mostly indicator features
 *         with value 1, plus a few real-valued distances/counts)
 */
private Counter<String> getFeatures(Document doc, Mention m1, Mention m2) {
    assert (m1.appearEarlierThan(m2));
    Counter<String> features = new ClassicCounter<>();
    // global features
    features.incrementCount("bias");
    if (useDocSource) {
        features.incrementCount("doc-type=" + doc.docType);
        // NOTE(review): split("/")[1] assumes DOC_ID always contains a '/' —
        // confirm the upstream DOC_ID format, otherwise this throws.
        if (doc.docInfo != null && doc.docInfo.containsKey("DOC_ID")) {
            features.incrementCount("doc-source=" + doc.docInfo.get("DOC_ID").split("/")[1]);
        }
    }
    // singleton feature conjunctions: pair up the k-th singleton feature of
    // each mention for the feature indices selected in SINGLETON_FEATURES
    List<String> singletonFeatures1 = m1.getSingletonFeatures(dictionaries);
    List<String> singletonFeatures2 = m2.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures1.size() && e.getKey() < singletonFeatures2.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures1.get(e.getKey()) + "_" + singletonFeatures2.get(e.getKey()));
        }
    }
    // dependency / POS / head-word features
    SemanticGraphEdge p1 = getDependencyParent(m1);
    SemanticGraphEdge p2 = getDependencyParent(m2);
    features.incrementCount("dep-relations=" + (p1 == null ? "null" : p1.getRelation()) + "_" + (p2 == null ? "null" : p2.getRelation()));
    features.incrementCount("roles=" + getRole(m1) + "_" + getRole(m2));
    CoreLabel headCL1 = headWord(m1);
    CoreLabel headCL2 = headWord(m2);
    String headPOS1 = getPOS(headCL1);
    String headPOS2 = getPOS(headCL2);
    features.incrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
    features.incrementCount("head-words=" + wordIndicator("h_" + headCL1.word().toLowerCase() + "_" + headCL2.word().toLowerCase(), headPOS1 + "_" + headPOS2));
    // agreement features
    addFeature(features, "animacies-agree", m2.animaciesAgree(m1));
    addFeature(features, "attributes-agree", m2.attributesAgree(m1, dictionaries));
    addFeature(features, "entity-types-agree", m2.entityTypesAgree(m1, dictionaries));
    addFeature(features, "numbers-agree", m2.numbersAgree(m1));
    addFeature(features, "genders-agree", m2.gendersAgree(m1));
    addFeature(features, "ner-strings-equal", m1.nerString.equals(m2.nerString));
    // string matching features
    addFeature(features, "antecedent-head-in-anaphor", headContainedIn(m1, m2));
    addFeature(features, "anaphor-head-in-antecedent", headContainedIn(m2, m1));
    // span-level string matching only makes sense for non-pronominal mentions
    if (m1.mentionType != MentionType.PRONOMINAL && m2.mentionType != MentionType.PRONOMINAL) {
        addFeature(features, "antecedent-in-anaphor", m2.spanToString().toLowerCase().contains(m1.spanToString().toLowerCase()));
        addFeature(features, "anaphor-in-antecedent", m1.spanToString().toLowerCase().contains(m2.spanToString().toLowerCase()));
        addFeature(features, "heads-equal", m1.headString.equalsIgnoreCase(m2.headString));
        addFeature(features, "heads-agree", m2.headsAgree(m1));
        addFeature(features, "exact-match", m1.toString().trim().toLowerCase().equals(m2.toString().trim().toLowerCase()));
        addFeature(features, "partial-match", relaxedStringMatch(m1, m2));
        // edit distances normalized by combined span/head length; also binned
        // to one decimal place as a separate indicator feature
        double editDistance = StringUtils.editDistance(m1.spanToString(), m2.spanToString()) / (double) (m1.spanToString().length() + m2.spanToString().length());
        features.incrementCount("edit-distance", editDistance);
        features.incrementCount("edit-distance=" + ((int) (editDistance * 10) / 10.0));
        double headEditDistance = StringUtils.editDistance(m1.headString, m2.headString) / (double) (m1.headString.length() + m2.headString.length());
        features.incrementCount("head-edit-distance", headEditDistance);
        features.incrementCount("head-edit-distance=" + ((int) (headEditDistance * 10) / 10.0));
    }
    // distance features
    addNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
    addNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
    if (m2.sentNum == m1.sentNum) {
        addNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
        if (m1.endIndex > m2.startIndex) {
            features.incrementCount("spans-intersect");
        }
    }
    // setup for dcoref features: wrap each mention in a throwaway singleton
    // cluster so the cluster-level CorefRules can be reused here.
    // NOTE(review): the random cluster IDs appear to serve only as distinct
    // identifiers (disjoint ranges 20000+ vs 10000+) and not to affect any
    // feature value — confirm before relying on determinism of this method.
    Set<Mention> ms1 = new HashSet<>();
    ms1.add(m1);
    Set<Mention> ms2 = new HashSet<>();
    ms2.add(m2);
    Random r = new Random();
    CorefCluster c1 = new CorefCluster(20000 + r.nextInt(10000), ms1);
    CorefCluster c2 = new CorefCluster(10000 + r.nextInt(10000), ms2);
    String s2 = m2.lowercaseNormalizedSpanString();
    String s1 = m1.lowercaseNormalizedSpanString();
    // discourse dcoref features (speaker/person/utterance-based rules)
    addFeature(features, "mention-speaker-PER0", m2.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0"));
    addFeature(features, "antecedent-is-anaphor-speaker", CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "same-speaker", CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "person-disagree-same-speaker", CorefRules.entityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.antecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
    addFeature(features, "discourse-you-PER0", m2.person == Person.YOU && doc.docType == DocType.ARTICLE && m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "speaker-match-i-i", m2.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s1) && m1.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s2) && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "speaker-match-speaker-i", m2.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s2) && CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "speaker-match-i-speaker", m1.number == Number.SINGULAR && dictionaries.firstPersonPronouns.contains(s1) && CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
    addFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.contains(s1) && dictionaries.secondPersonPronouns.contains(s2) && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "discourse-between-two-person", ((m2.person == Person.I && m1.person == Person.YOU || (m2.person == Person.YOU && m1.person == Person.I)) && (m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && doc.docType == DocType.CONVERSATION));
    addFeature(features, "incompatible-not-match", m1.person != Person.I && m2.person != Person.I && (CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));
    int utteranceDist = Math.abs(m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class));
    // adjacent utterances with different speakers: matching pronoun persons
    // across the turn boundary suggest the mentions refer to different people
    if (doc.docType != DocType.ARTICLE && utteranceDist == 1 && !CorefRules.entitySameSpeaker(doc, m2, m1)) {
        addFeature(features, "speaker-mismatch-i-i", m1.person == Person.I && m2.person == Person.I);
        addFeature(features, "speaker-mismatch-you-you", m1.person == Person.YOU && m2.person == Person.YOU);
        addFeature(features, "speaker-mismatch-we-we", m1.person == Person.WE && m2.person == Person.WE);
    }
    // other dcoref features
    String firstWord1 = firstWord(m1).word().toLowerCase();
    addFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.equals("a") || firstWord1.equals("an"))));
    addFeature(features, "far-this", m2.lowercaseNormalizedSpanString().equals("this") && Math.abs(m2.sentNum - m1.sentNum) > 3);
    addFeature(features, "per0-you-in-article", m2.person == Person.YOU && doc.docType == DocType.ARTICLE && m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "inside-in", m2.insideIn(m1) || m1.insideIn(m2));
    addFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.contains(m1.originalSpan.get(0).lemma()) || dictionaries.indefinitePronouns.contains(m2.originalSpan.get(0).lemma()));
    addFeature(features, "entity-attributes-agree", CorefRules.entityAttributesAgree(c2, c1));
    addFeature(features, "entity-token-distance", CorefRules.entityTokenDistance(m2, m1));
    addFeature(features, "i-within-i", CorefRules.entityIWithinI(m2, m1, dictionaries));
    addFeature(features, "exact-string-match", CorefRules.entityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
    addFeature(features, "entity-relaxed-heads-agree", CorefRules.entityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
    addFeature(features, "is-acronym", CorefRules.entityIsAcronym(doc, c2, c1));
    addFeature(features, "demonym", m2.isDemonym(m1, dictionaries));
    addFeature(features, "incompatible-modifier", CorefRules.entityHaveIncompatibleModifier(m2, m1));
    addFeature(features, "head-lemma-match", m1.headWord.lemma().equals(m2.headWord.lemma()));
    addFeature(features, "words-included", CorefRules.entityWordsIncluded(c2, c1, m2, m1));
    addFeature(features, "extra-proper-noun", CorefRules.entityHaveExtraProperNoun(m2, m1, new HashSet<>()));
    addFeature(features, "number-in-later-mentions", CorefRules.entityNumberInLaterMention(m2, m1));
    addFeature(features, "sentence-context-incompatible", CorefRules.sentenceContextIncompatible(m2, m1, dictionaries));
    // syntax features
    if (useConstituencyParse) {
        if (m1.sentNum == m2.sentNum) {
            // count clause (S*) nodes on the path from m2's subtree up to the
            // lowest ancestor dominating m1's subtree (or to ROOT)
            int clauseCount = 0;
            Tree tree = m2.contextParseTree;
            Tree current = m2.mentionSubTree;
            while (true) {
                current = current.ancestor(1, tree);
                if (current.label().value().startsWith("S")) {
                    clauseCount++;
                }
                if (current.dominates(m1.mentionSubTree)) {
                    break;
                }
                if (current.label().value().equals("ROOT") || current.ancestor(1, tree) == null) {
                    break;
                }
            }
            features.incrementCount("clause-count", clauseCount);
            features.incrementCount("clause-count=" + bin(clauseCount));
        }
        if (RuleBasedCorefMentionFinder.isPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.isPleonastic(m1, m1.contextParseTree)) {
            features.incrementCount("pleonastic-it");
        }
        // reference equality is intentional here: same maximal NP node object
        if (maximalNp(m1.mentionSubTree) == maximalNp(m2.mentionSubTree)) {
            features.incrementCount("same-maximal-np");
        }
        boolean m1Embedded = headEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
        boolean m2Embedded = headEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
        features.incrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
    }
    return features;
}
Also used : SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Random(java.util.Random) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Tree(edu.stanford.nlp.trees.Tree) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet) SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation)

Example 4 with CorefCluster

use of edu.stanford.nlp.coref.data.CorefCluster in project CoreNLP by stanfordnlp.

From the class RFSieve, method extractDatum.

public static RVFDatum<Boolean, String> extractDatum(Mention m, Mention candidate, Document document, int mentionDist, Dictionaries dict, Properties props, String sievename) {
    try {
        boolean label = (document.goldMentions == null) ? false : document.isCoref(m, candidate);
        Counter<String> features = new ClassicCounter<>();
        CorefCluster mC = document.corefClusters.get(m.corefClusterID);
        CorefCluster aC = document.corefClusters.get(candidate.corefClusterID);
        CoreLabel mFirst = m.sentenceWords.get(m.startIndex);
        CoreLabel mLast = m.sentenceWords.get(m.endIndex - 1);
        CoreLabel mPreceding = (m.startIndex > 0) ? m.sentenceWords.get(m.startIndex - 1) : null;
        CoreLabel mFollowing = (m.endIndex < m.sentenceWords.size()) ? m.sentenceWords.get(m.endIndex) : null;
        CoreLabel aFirst = candidate.sentenceWords.get(candidate.startIndex);
        CoreLabel aLast = candidate.sentenceWords.get(candidate.endIndex - 1);
        CoreLabel aPreceding = (candidate.startIndex > 0) ? candidate.sentenceWords.get(candidate.startIndex - 1) : null;
        CoreLabel aFollowing = (candidate.endIndex < candidate.sentenceWords.size()) ? candidate.sentenceWords.get(candidate.endIndex) : null;
        // //////////////////////////////////////////////////////////////////////////////
        if (HybridCorefProperties.useBasicFeatures(props, sievename)) {
            int sentDist = m.sentNum - candidate.sentNum;
            features.incrementCount("SENTDIST", sentDist);
            features.incrementCount("MENTIONDIST", mentionDist);
            int minSentDist = sentDist;
            for (Mention a : aC.corefMentions) {
                minSentDist = Math.min(minSentDist, Math.abs(m.sentNum - a.sentNum));
            }
            features.incrementCount("MINSENTDIST", minSentDist);
            // When they are in the same sentence, divides a sentence into clauses and add such feature
            if (CorefProperties.useConstituencyParse(props)) {
                if (m.sentNum == candidate.sentNum) {
                    int clauseCount = 0;
                    Tree tree = m.contextParseTree;
                    Tree current = m.mentionSubTree;
                    while (true) {
                        current = current.ancestor(1, tree);
                        if (current.label().value().startsWith("S")) {
                            clauseCount++;
                        }
                        if (current.dominates(candidate.mentionSubTree))
                            break;
                        if (current.label().value().equals("ROOT") || current.ancestor(1, tree) == null)
                            break;
                    }
                    features.incrementCount("CLAUSECOUNT", clauseCount);
                }
            }
            if (document.docType == DocType.CONVERSATION)
                features.incrementCount("B-DOCTYPE-" + document.docType);
            if (m.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0")) {
                features.incrementCount("B-SPEAKER-PER0");
            }
            if (document.docInfo != null && document.docInfo.containsKey("DOC_ID")) {
                features.incrementCount("B-DOCSOURCE-" + document.docInfo.get("DOC_ID").split("/")[1]);
            }
            features.incrementCount("M-LENGTH", m.originalSpan.size());
            features.incrementCount("A-LENGTH", candidate.originalSpan.size());
            if (m.originalSpan.size() < candidate.originalSpan.size())
                features.incrementCount("B-A-ISLONGER");
            features.incrementCount("A-SIZE", aC.getCorefMentions().size());
            features.incrementCount("M-SIZE", mC.getCorefMentions().size());
            String antRole = "A-NOROLE";
            String mRole = "M-NOROLE";
            if (m.isSubject)
                mRole = "M-SUBJ";
            if (m.isDirectObject)
                mRole = "M-DOBJ";
            if (m.isIndirectObject)
                mRole = "M-IOBJ";
            if (m.isPrepositionObject)
                mRole = "M-POBJ";
            if (candidate.isSubject)
                antRole = "A-SUBJ";
            if (candidate.isDirectObject)
                antRole = "A-DOBJ";
            if (candidate.isIndirectObject)
                antRole = "A-IOBJ";
            if (candidate.isPrepositionObject)
                antRole = "A-POBJ";
            features.incrementCount("B-" + mRole);
            features.incrementCount("B-" + antRole);
            features.incrementCount("B-" + antRole + "-" + mRole);
            if (HybridCorefProperties.combineObjectRoles(props, sievename)) {
                // combine all objects
                if (m.isDirectObject || m.isIndirectObject || m.isPrepositionObject || candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) {
                    if (m.isDirectObject || m.isIndirectObject || m.isPrepositionObject) {
                        mRole = "M-OBJ";
                        features.incrementCount("B-M-OBJ");
                    }
                    if (candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) {
                        antRole = "A-OBJ";
                        features.incrementCount("B-A-OBJ");
                    }
                    features.incrementCount("B-" + antRole + "-" + mRole);
                }
            }
            if (mFirst.word().toLowerCase().matches("a|an")) {
                features.incrementCount("B-M-START-WITH-INDEFINITE");
            }
            if (aFirst.word().toLowerCase().matches("a|an")) {
                features.incrementCount("B-A-START-WITH-INDEFINITE");
            }
            if (mFirst.word().equalsIgnoreCase("the")) {
                features.incrementCount("B-M-START-WITH-DEFINITE");
            }
            if (aFirst.word().equalsIgnoreCase("the")) {
                features.incrementCount("B-A-START-WITH-DEFINITE");
            }
            if (dict.indefinitePronouns.contains(m.lowercaseNormalizedSpanString())) {
                features.incrementCount("B-M-INDEFINITE-PRONOUN");
            }
            if (dict.indefinitePronouns.contains(candidate.lowercaseNormalizedSpanString())) {
                features.incrementCount("B-A-INDEFINITE-PRONOUN");
            }
            if (dict.indefinitePronouns.contains(mFirst.word().toLowerCase())) {
                features.incrementCount("B-M-INDEFINITE-ADJ");
            }
            if (dict.indefinitePronouns.contains(aFirst.word().toLowerCase())) {
                features.incrementCount("B-A-INDEFINITE-ADJ");
            }
            if (dict.reflexivePronouns.contains(m.headString)) {
                features.incrementCount("B-M-REFLEXIVE");
            }
            if (dict.reflexivePronouns.contains(candidate.headString)) {
                features.incrementCount("B-A-REFLEXIVE");
            }
            if (m.headIndex == m.endIndex - 1)
                features.incrementCount("B-M-HEADEND");
            if (m.headIndex < m.endIndex - 1) {
                CoreLabel headnext = m.sentenceWords.get(m.headIndex + 1);
                if (headnext.word().matches("that|,") || headnext.tag().startsWith("W")) {
                    features.incrementCount("B-M-HASPOSTPHRASE");
                    if (mFirst.tag().equals("DT") && mFirst.word().toLowerCase().matches("the|this|these|those"))
                        features.incrementCount("B-M-THE-HASPOSTPHRASE");
                    else if (mFirst.word().toLowerCase().matches("a|an"))
                        features.incrementCount("B-M-INDEFINITE-HASPOSTPHRASE");
                }
            }
            // shape feature from Bjorkelund & Kuhn
            StringBuilder sb = new StringBuilder();
            List<Mention> sortedMentions = new ArrayList<>(aC.corefMentions.size());
            sortedMentions.addAll(aC.corefMentions);
            sortedMentions.sort(new CorefChain.MentionComparator());
            for (Mention a : sortedMentions) {
                sb.append(a.mentionType).append("-");
            }
            features.incrementCount("B-A-SHAPE-" + sb);
            sb = new StringBuilder();
            sortedMentions = new ArrayList<>(mC.corefMentions.size());
            sortedMentions.addAll(mC.corefMentions);
            sortedMentions.sort(new CorefChain.MentionComparator());
            for (Mention men : sortedMentions) {
                sb.append(men.mentionType).append("-");
            }
            features.incrementCount("B-M-SHAPE-" + sb);
            if (CorefProperties.useConstituencyParse(props)) {
                sb = new StringBuilder();
                Tree mTree = m.contextParseTree;
                Tree mHead = mTree.getLeaves().get(m.headIndex).ancestor(1, mTree);
                for (Tree node : mTree.pathNodeToNode(mHead, mTree)) {
                    sb.append(node.value()).append("-");
                    if (node.value().equals("S"))
                        break;
                }
                features.incrementCount("B-M-SYNPATH-" + sb);
                sb = new StringBuilder();
                Tree aTree = candidate.contextParseTree;
                Tree aHead = aTree.getLeaves().get(candidate.headIndex).ancestor(1, aTree);
                for (Tree node : aTree.pathNodeToNode(aHead, aTree)) {
                    sb.append(node.value()).append("-");
                    if (node.value().equals("S"))
                        break;
                }
                features.incrementCount("B-A-SYNPATH-" + sb);
            }
            features.incrementCount("A-FIRSTAPPEAR", aC.representative.sentNum);
            features.incrementCount("M-FIRSTAPPEAR", mC.representative.sentNum);
            // document size in # of sentences
            int docSize = document.predictedMentions.size();
            features.incrementCount("A-FIRSTAPPEAR-NORMALIZED", aC.representative.sentNum / docSize);
            features.incrementCount("M-FIRSTAPPEAR-NORMALIZED", mC.representative.sentNum / docSize);
        }
        // //////////////////////////////////////////////////////////////////////////////
        if (HybridCorefProperties.useMentionDetectionFeatures(props, sievename)) {
            // bare plurals
            if (m.originalSpan.size() == 1 && m.headWord.tag().equals("NNS"))
                features.incrementCount("B-M-BAREPLURAL");
            if (candidate.originalSpan.size() == 1 && candidate.headWord.tag().equals("NNS"))
                features.incrementCount("B-A-BAREPLURAL");
            // pleonastic it
            if (CorefProperties.useConstituencyParse(props)) {
                if (RuleBasedCorefMentionFinder.isPleonastic(m, m.contextParseTree) || RuleBasedCorefMentionFinder.isPleonastic(candidate, candidate.contextParseTree)) {
                    features.incrementCount("B-PLEONASTICIT");
                }
            }
            // quantRule
            if (dict.quantifiers.contains(mFirst.word().toLowerCase(Locale.ENGLISH)))
                features.incrementCount("B-M-QUANTIFIER");
            if (dict.quantifiers.contains(aFirst.word().toLowerCase(Locale.ENGLISH)))
                features.incrementCount("B-A-QUANTIFIER");
            // starts with negation
            if (mFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not") || aFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not")) {
                features.incrementCount("B-NEGATIVE-START");
            }
            // parititive rule
            if (RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dict))
                features.incrementCount("B-M-PARTITIVE");
            if (RuleBasedCorefMentionFinder.partitiveRule(candidate, candidate.sentenceWords, dict))
                features.incrementCount("B-A-PARTITIVE");
            // %
            if (m.headString.equals("%"))
                features.incrementCount("B-M-HEAD%");
            if (candidate.headString.equals("%"))
                features.incrementCount("B-A-HEAD%");
            // adjective form of nations
            if (dict.isAdjectivalDemonym(m.spanToString()))
                features.incrementCount("B-M-ADJ-DEMONYM");
            if (dict.isAdjectivalDemonym(candidate.spanToString()))
                features.incrementCount("B-A-ADJ-DEMONYM");
            // ends with "etc."
            if (m.lowercaseNormalizedSpanString().endsWith("etc."))
                features.incrementCount("B-M-ETC-END");
            if (candidate.lowercaseNormalizedSpanString().endsWith("etc."))
                features.incrementCount("B-A-ETC-END");
        }
        // //////////////////////////////////////////////////////////////////////////////
        // /////    attributes, attributes agree                             ////////////
        // //////////////////////////////////////////////////////////////////////////////
        features.incrementCount("B-M-NUMBER-" + m.number);
        features.incrementCount("B-A-NUMBER-" + candidate.number);
        features.incrementCount("B-M-GENDER-" + m.gender);
        features.incrementCount("B-A-GENDER-" + candidate.gender);
        features.incrementCount("B-M-ANIMACY-" + m.animacy);
        features.incrementCount("B-A-ANIMACY-" + candidate.animacy);
        features.incrementCount("B-M-PERSON-" + m.person);
        features.incrementCount("B-A-PERSON-" + candidate.person);
        features.incrementCount("B-M-NETYPE-" + m.nerString);
        features.incrementCount("B-A-NETYPE-" + candidate.nerString);
        features.incrementCount("B-BOTH-NUMBER-" + candidate.number + "-" + m.number);
        features.incrementCount("B-BOTH-GENDER-" + candidate.gender + "-" + m.gender);
        features.incrementCount("B-BOTH-ANIMACY-" + candidate.animacy + "-" + m.animacy);
        features.incrementCount("B-BOTH-PERSON-" + candidate.person + "-" + m.person);
        features.incrementCount("B-BOTH-NETYPE-" + candidate.nerString + "-" + m.nerString);
        Set<Number> mcNumber = Generics.newHashSet();
        for (Number n : mC.numbers) {
            features.incrementCount("B-MC-NUMBER-" + n);
            mcNumber.add(n);
        }
        if (mcNumber.size() == 1) {
            features.incrementCount("B-MC-CLUSTERNUMBER-" + mcNumber.iterator().next());
        } else {
            mcNumber.remove(Number.UNKNOWN);
            if (mcNumber.size() == 1)
                features.incrementCount("B-MC-CLUSTERNUMBER-" + mcNumber.iterator().next());
            else
                features.incrementCount("B-MC-CLUSTERNUMBER-CONFLICT");
        }
        Set<Gender> mcGender = Generics.newHashSet();
        for (Gender g : mC.genders) {
            features.incrementCount("B-MC-GENDER-" + g);
            mcGender.add(g);
        }
        if (mcGender.size() == 1) {
            features.incrementCount("B-MC-CLUSTERGENDER-" + mcGender.iterator().next());
        } else {
            mcGender.remove(Gender.UNKNOWN);
            if (mcGender.size() == 1)
                features.incrementCount("B-MC-CLUSTERGENDER-" + mcGender.iterator().next());
            else
                features.incrementCount("B-MC-CLUSTERGENDER-CONFLICT");
        }
        Set<Animacy> mcAnimacy = Generics.newHashSet();
        for (Animacy a : mC.animacies) {
            features.incrementCount("B-MC-ANIMACY-" + a);
            mcAnimacy.add(a);
        }
        if (mcAnimacy.size() == 1) {
            features.incrementCount("B-MC-CLUSTERANIMACY-" + mcAnimacy.iterator().next());
        } else {
            mcAnimacy.remove(Animacy.UNKNOWN);
            if (mcAnimacy.size() == 1)
                features.incrementCount("B-MC-CLUSTERANIMACY-" + mcAnimacy.iterator().next());
            else
                features.incrementCount("B-MC-CLUSTERANIMACY-CONFLICT");
        }
        Set<String> mcNER = Generics.newHashSet();
        for (String t : mC.nerStrings) {
            features.incrementCount("B-MC-NETYPE-" + t);
            mcNER.add(t);
        }
        if (mcNER.size() == 1) {
            features.incrementCount("B-MC-CLUSTERNETYPE-" + mcNER.iterator().next());
        } else {
            mcNER.remove("O");
            if (mcNER.size() == 1)
                features.incrementCount("B-MC-CLUSTERNETYPE-" + mcNER.iterator().next());
            else
                features.incrementCount("B-MC-CLUSTERNETYPE-CONFLICT");
        }
        Set<Number> acNumber = Generics.newHashSet();
        for (Number n : aC.numbers) {
            features.incrementCount("B-AC-NUMBER-" + n);
            acNumber.add(n);
        }
        if (acNumber.size() == 1) {
            features.incrementCount("B-AC-CLUSTERNUMBER-" + acNumber.iterator().next());
        } else {
            acNumber.remove(Number.UNKNOWN);
            if (acNumber.size() == 1)
                features.incrementCount("B-AC-CLUSTERNUMBER-" + acNumber.iterator().next());
            else
                features.incrementCount("B-AC-CLUSTERNUMBER-CONFLICT");
        }
        Set<Gender> acGender = Generics.newHashSet();
        for (Gender g : aC.genders) {
            features.incrementCount("B-AC-GENDER-" + g);
            acGender.add(g);
        }
        if (acGender.size() == 1) {
            features.incrementCount("B-AC-CLUSTERGENDER-" + acGender.iterator().next());
        } else {
            acGender.remove(Gender.UNKNOWN);
            if (acGender.size() == 1)
                features.incrementCount("B-AC-CLUSTERGENDER-" + acGender.iterator().next());
            else
                features.incrementCount("B-AC-CLUSTERGENDER-CONFLICT");
        }
        Set<Animacy> acAnimacy = Generics.newHashSet();
        for (Animacy a : aC.animacies) {
            features.incrementCount("B-AC-ANIMACY-" + a);
            acAnimacy.add(a);
        }
        if (acAnimacy.size() == 1) {
            features.incrementCount("B-AC-CLUSTERANIMACY-" + acAnimacy.iterator().next());
        } else {
            acAnimacy.remove(Animacy.UNKNOWN);
            if (acAnimacy.size() == 1)
                features.incrementCount("B-AC-CLUSTERANIMACY-" + acAnimacy.iterator().next());
            else
                features.incrementCount("B-AC-CLUSTERANIMACY-CONFLICT");
        }
        Set<String> acNER = Generics.newHashSet();
        for (String t : aC.nerStrings) {
            features.incrementCount("B-AC-NETYPE-" + t);
            acNER.add(t);
        }
        if (acNER.size() == 1) {
            features.incrementCount("B-AC-CLUSTERNETYPE-" + acNER.iterator().next());
        } else {
            acNER.remove("O");
            if (acNER.size() == 1)
                features.incrementCount("B-AC-CLUSTERNETYPE-" + acNER.iterator().next());
            else
                features.incrementCount("B-AC-CLUSTERNETYPE-CONFLICT");
        }
        if (m.numbersAgree(candidate))
            features.incrementCount("B-NUMBER-AGREE");
        if (m.gendersAgree(candidate))
            features.incrementCount("B-GENDER-AGREE");
        if (m.animaciesAgree(candidate))
            features.incrementCount("B-ANIMACY-AGREE");
        if (CorefRules.entityAttributesAgree(mC, aC))
            features.incrementCount("B-ATTRIBUTES-AGREE");
        if (CorefRules.entityPersonDisagree(document, m, candidate, dict))
            features.incrementCount("B-PERSON-DISAGREE");
        // //////////////////////////////////////////////////////////////////////////////
        if (HybridCorefProperties.useDcorefRules(props, sievename)) {
            if (CorefRules.entityIWithinI(m, candidate, dict))
                features.incrementCount("B-i-within-i");
            if (CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict))
                features.incrementCount("B-ANT-IS-SPEAKER");
            if (CorefRules.entitySameSpeaker(document, m, candidate))
                features.incrementCount("B-SAME-SPEAKER");
            if (CorefRules.entitySubjectObject(m, candidate))
                features.incrementCount("B-SUBJ-OBJ");
            for (Mention a : aC.corefMentions) {
                if (CorefRules.entitySubjectObject(m, a))
                    features.incrementCount("B-CLUSTER-SUBJ-OBJ");
            }
            if (CorefRules.entityPersonDisagree(document, m, candidate, dict) && CorefRules.entitySameSpeaker(document, m, candidate))
                features.incrementCount("B-PERSON-DISAGREE-SAME-SPEAKER");
            if (CorefRules.entityIWithinI(mC, aC, dict))
                features.incrementCount("B-ENTITY-IWITHINI");
            if (CorefRules.antecedentMatchesMentionSpeakerAnnotation(m, candidate, document))
                features.incrementCount("B-ANT-IS-SPEAKER-OF-MENTION");
            Set<MentionType> mType = HybridCorefProperties.getMentionType(props, sievename);
            if (mType.contains(MentionType.PROPER) || mType.contains(MentionType.NOMINAL)) {
                if (m.headString.equals(candidate.headString))
                    features.incrementCount("B-HEADMATCH");
                if (CorefRules.entityHeadsAgree(mC, aC, m, candidate, dict))
                    features.incrementCount("B-HEADSAGREE");
                if (CorefRules.entityExactStringMatch(mC, aC, dict, document.roleSet))
                    features.incrementCount("B-EXACTSTRINGMATCH");
                if (CorefRules.entityHaveExtraProperNoun(m, candidate, new HashSet<>()))
                    features.incrementCount("B-HAVE-EXTRA-PROPER-NOUN");
                if (CorefRules.entityBothHaveProper(mC, aC))
                    features.incrementCount("B-BOTH-HAVE-PROPER");
                if (CorefRules.entityHaveDifferentLocation(m, candidate, dict))
                    features.incrementCount("B-HAVE-DIFF-LOC");
                if (CorefRules.entityHaveIncompatibleModifier(mC, aC))
                    features.incrementCount("B-HAVE-INCOMPATIBLE-MODIFIER");
                if (CorefRules.entityIsAcronym(document, mC, aC))
                    features.incrementCount("B-IS-ACRONYM");
                if (CorefRules.entityIsApposition(mC, aC, m, candidate))
                    features.incrementCount("B-IS-APPOSITION");
                if (CorefRules.entityIsPredicateNominatives(mC, aC, m, candidate))
                    features.incrementCount("B-IS-PREDICATE-NOMINATIVES");
                if (CorefRules.entityIsRoleAppositive(mC, aC, m, candidate, dict))
                    features.incrementCount("B-IS-ROLE-APPOSITIVE");
                if (CorefRules.entityNumberInLaterMention(m, candidate))
                    features.incrementCount("B-NUMBER-IN-LATER");
                if (CorefRules.entityRelaxedExactStringMatch(mC, aC, m, candidate, dict, document.roleSet))
                    features.incrementCount("B-RELAXED-EXACT-STRING-MATCH");
                if (CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mC, aC, m, candidate))
                    features.incrementCount("B-RELAXED-HEAD-AGREE");
                if (CorefRules.entitySameProperHeadLastWord(m, candidate))
                    features.incrementCount("B-SAME-PROPER-HEAD");
                if (CorefRules.entitySameProperHeadLastWord(mC, aC, m, candidate))
                    features.incrementCount("B-CLUSTER-SAME-PROPER-HEAD");
                if (CorefRules.entityWordsIncluded(mC, aC, m, candidate))
                    features.incrementCount("B-WORD-INCLUSION");
            }
            if (mType.contains(MentionType.LIST)) {
                features.incrementCount("NUM-LIST-", numEntitiesInList(m));
                if (m.spanToString().contains("two") || m.spanToString().contains("2") || m.spanToString().contains("both"))
                    features.incrementCount("LIST-M-TWO");
                if (m.spanToString().contains("three") || m.spanToString().contains("3"))
                    features.incrementCount("LIST-M-THREE");
                if (candidate.spanToString().contains("two") || candidate.spanToString().contains("2") || candidate.spanToString().contains("both")) {
                    features.incrementCount("B-LIST-A-TWO");
                }
                if (candidate.spanToString().contains("three") || candidate.spanToString().contains("3")) {
                    features.incrementCount("B-LIST-A-THREE");
                }
            }
            if (mType.contains(MentionType.PRONOMINAL)) {
                if (dict.firstPersonPronouns.contains(m.headString))
                    features.incrementCount("B-M-I");
                if (dict.secondPersonPronouns.contains(m.headString))
                    features.incrementCount("B-M-YOU");
                if (dict.thirdPersonPronouns.contains(m.headString))
                    features.incrementCount("B-M-3RDPERSON");
                if (dict.possessivePronouns.contains(m.headString))
                    features.incrementCount("B-M-POSSESSIVE");
                if (dict.neutralPronouns.contains(m.headString))
                    features.incrementCount("B-M-NEUTRAL");
                if (dict.malePronouns.contains(m.headString))
                    features.incrementCount("B-M-MALE");
                if (dict.femalePronouns.contains(m.headString))
                    features.incrementCount("B-M-FEMALE");
                if (dict.firstPersonPronouns.contains(candidate.headString))
                    features.incrementCount("B-A-I");
                if (dict.secondPersonPronouns.contains(candidate.headString))
                    features.incrementCount("B-A-YOU");
                if (dict.thirdPersonPronouns.contains(candidate.headString))
                    features.incrementCount("B-A-3RDPERSON");
                if (dict.possessivePronouns.contains(candidate.headString))
                    features.incrementCount("B-A-POSSESSIVE");
                if (dict.neutralPronouns.contains(candidate.headString))
                    features.incrementCount("B-A-NEUTRAL");
                if (dict.malePronouns.contains(candidate.headString))
                    features.incrementCount("B-A-MALE");
                if (dict.femalePronouns.contains(candidate.headString))
                    features.incrementCount("B-A-FEMALE");
                features.incrementCount("B-M-GENERIC-" + m.generic);
                features.incrementCount("B-A-GENERIC-" + candidate.generic);
                if (HybridCorefPrinter.dcorefPronounSieve.skipThisMention(document, m, mC, dict)) {
                    features.incrementCount("B-SKIPTHISMENTION-true");
                }
                if (m.spanToString().equalsIgnoreCase("you") && mFollowing != null && mFollowing.word().equalsIgnoreCase("know")) {
                    features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((mPreceding == null) ? "NULL" : mPreceding.tag()));
                    features.incrementCount("B-YOUKNOW-PRECEDING-WORD-" + ((mPreceding == null) ? "NULL" : mPreceding.word().toLowerCase()));
                    CoreLabel nextword = (m.endIndex + 1 < m.sentenceWords.size()) ? m.sentenceWords.get(m.endIndex + 1) : null;
                    features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword == null) ? "NULL" : nextword.tag()));
                    features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-" + ((nextword == null) ? "NULL" : nextword.word().toLowerCase()));
                }
                if (candidate.spanToString().equalsIgnoreCase("you") && aFollowing != null && aFollowing.word().equalsIgnoreCase("know")) {
                    features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((aPreceding == null) ? "NULL" : aPreceding.tag()));
                    features.incrementCount("B-YOUKNOW-PRECEDING-WORD-" + ((aPreceding == null) ? "NULL" : aPreceding.word().toLowerCase()));
                    CoreLabel nextword = (candidate.endIndex + 1 < candidate.sentenceWords.size()) ? candidate.sentenceWords.get(candidate.endIndex + 1) : null;
                    features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword == null) ? "NULL" : nextword.tag()));
                    features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-" + ((nextword == null) ? "NULL" : nextword.word().toLowerCase()));
                }
            }
            // discourse match features
            if (m.person == Person.YOU && document.docType == DocType.ARTICLE && m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) {
                features.incrementCount("B-DISCOURSE-M-YOU-GENERIC?");
            }
            if (candidate.generic && candidate.person == Person.YOU)
                features.incrementCount("B-DISCOURSE-A-YOU-GENERIC?");
            String mString = m.lowercaseNormalizedSpanString();
            String antString = candidate.lowercaseNormalizedSpanString();
            // I-I
            if (m.number == Number.SINGULAR && dict.firstPersonPronouns.contains(mString) && candidate.number == Number.SINGULAR && dict.firstPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, m, candidate)) {
                features.incrementCount("B-DISCOURSE-I-I-SAMESPEAKER");
            }
            // (speaker - I)
            if ((m.number == Number.SINGULAR && dict.firstPersonPronouns.contains(mString)) && CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) {
                features.incrementCount("B-DISCOURSE-SPEAKER-I");
            }
            // (I - speaker)
            if ((candidate.number == Number.SINGULAR && dict.firstPersonPronouns.contains(antString)) && CorefRules.antecedentIsMentionSpeaker(document, candidate, m, dict)) {
                features.incrementCount("B-DISCOURSE-I-SPEAKER");
            }
            // Can be iffy if more than two speakers... but still should be okay most of the time
            if (dict.secondPersonPronouns.contains(mString) && dict.secondPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, m, candidate)) {
                features.incrementCount("B-DISCOURSE-BOTH-YOU");
            }
            // previous I - you or previous you - I in two person conversation
            if (((m.person == Person.I && candidate.person == Person.YOU || (m.person == Person.YOU && candidate.person == Person.I)) && (m.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - candidate.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && document.docType == DocType.CONVERSATION)) {
                features.incrementCount("B-DISCOURSE-I-YOU");
            }
            if (dict.reflexivePronouns.contains(m.headString) && CorefRules.entitySubjectObject(m, candidate)) {
                features.incrementCount("B-DISCOURSE-REFLEXIVE");
            }
            if (m.person == Person.I && candidate.person == Person.I && !CorefRules.entitySameSpeaker(document, m, candidate)) {
                features.incrementCount("B-DISCOURSE-I-I-DIFFSPEAKER");
            }
            if (m.person == Person.YOU && candidate.person == Person.YOU && !CorefRules.entitySameSpeaker(document, m, candidate)) {
                features.incrementCount("B-DISCOURSE-YOU-YOU-DIFFSPEAKER");
            }
            if (m.person == Person.WE && candidate.person == Person.WE && !CorefRules.entitySameSpeaker(document, m, candidate)) {
                features.incrementCount("B-DISCOURSE-WE-WE-DIFFSPEAKER");
            }
        }
        // //////////////////////////////////////////////////////////////////////////////
        if (HybridCorefProperties.usePOSFeatures(props, sievename)) {
            features.incrementCount("B-LEXICAL-M-HEADPOS-" + m.headWord.tag());
            features.incrementCount("B-LEXICAL-A-HEADPOS-" + candidate.headWord.tag());
            features.incrementCount("B-LEXICAL-M-FIRSTPOS-" + mFirst.tag());
            features.incrementCount("B-LEXICAL-A-FIRSTPOS-" + aFirst.tag());
            features.incrementCount("B-LEXICAL-M-LASTPOS-" + mLast.tag());
            features.incrementCount("B-LEXICAL-A-LASTPOS-" + aLast.tag());
            features.incrementCount("B-LEXICAL-M-PRECEDINGPOS-" + ((mPreceding == null) ? "NULL" : mPreceding.tag()));
            features.incrementCount("B-LEXICAL-A-PRECEDINGPOS-" + ((aPreceding == null) ? "NULL" : aPreceding.tag()));
            features.incrementCount("B-LEXICAL-M-FOLLOWINGPOS-" + ((mFollowing == null) ? "NULL" : mFollowing.tag()));
            features.incrementCount("B-LEXICAL-A-FOLLOWINGPOS-" + ((aFollowing == null) ? "NULL" : aFollowing.tag()));
        }
        // //////////////////////////////////////////////////////////////////////////////
        if (HybridCorefProperties.useLexicalFeatures(props, sievename)) {
            features.incrementCount("B-LEXICAL-M-HEADWORD-" + m.headString.toLowerCase());
            features.incrementCount("B-LEXICAL-A-HEADWORD-" + candidate.headString.toLowerCase());
            features.incrementCount("B-LEXICAL-M-FIRSTWORD-" + mFirst.word().toLowerCase());
            features.incrementCount("B-LEXICAL-A-FIRSTWORD-" + aFirst.word().toLowerCase());
            features.incrementCount("B-LEXICAL-M-LASTWORD-" + mLast.word().toLowerCase());
            features.incrementCount("B-LEXICAL-A-LASTWORD-" + aLast.word().toLowerCase());
            features.incrementCount("B-LEXICAL-M-PRECEDINGWORD-" + ((mPreceding == null) ? "NULL" : mPreceding.word().toLowerCase()));
            features.incrementCount("B-LEXICAL-A-PRECEDINGWORD-" + ((aPreceding == null) ? "NULL" : aPreceding.word().toLowerCase()));
            features.incrementCount("B-LEXICAL-M-FOLLOWINGWORD-" + ((mFollowing == null) ? "NULL" : mFollowing.word().toLowerCase()));
            features.incrementCount("B-LEXICAL-A-FOLLOWINGWORD-" + ((aFollowing == null) ? "NULL" : aFollowing.word().toLowerCase()));
            // extra headword, modifiers lexical features
            for (String mHead : mC.heads) {
                if (!aC.heads.contains(mHead))
                    features.incrementCount("B-LEXICAL-MC-EXTRAHEAD-" + mHead);
            }
            for (String mWord : mC.words) {
                if (!aC.words.contains(mWord))
                    features.incrementCount("B-LEXICAL-MC-EXTRAWORD-" + mWord);
            }
        }
        // cosine
        if (HybridCorefProperties.useWordEmbedding(props, sievename)) {
            // dimension
            int dim = dict.vectors.entrySet().iterator().next().getValue().length;
            // distance between headword
            float[] mV = dict.vectors.get(m.headString.toLowerCase());
            float[] aV = dict.vectors.get(candidate.headString.toLowerCase());
            if (mV != null && aV != null) {
                features.incrementCount("WORDVECTOR-DIFF-HEADWORD", cosine(mV, aV));
            }
            mV = dict.vectors.get(mFirst.word().toLowerCase());
            aV = dict.vectors.get(aFirst.word().toLowerCase());
            if (mV != null && aV != null) {
                features.incrementCount("WORDVECTOR-DIFF-FIRSTWORD", cosine(mV, aV));
            }
            mV = dict.vectors.get(mLast.word().toLowerCase());
            aV = dict.vectors.get(aLast.word().toLowerCase());
            if (mV != null && aV != null) {
                features.incrementCount("WORDVECTOR-DIFF-LASTWORD", cosine(mV, aV));
            }
            if (mPreceding != null && aPreceding != null) {
                mV = dict.vectors.get(mPreceding.word().toLowerCase());
                aV = dict.vectors.get(aPreceding.word().toLowerCase());
                if (mV != null && aV != null) {
                    features.incrementCount("WORDVECTOR-DIFF-PRECEDINGWORD", cosine(mV, aV));
                }
            }
            if (mFollowing != null && aFollowing != null) {
                mV = dict.vectors.get(mFollowing.word().toLowerCase());
                aV = dict.vectors.get(aFollowing.word().toLowerCase());
                if (mV != null && aV != null) {
                    features.incrementCount("WORDVECTOR-DIFF-FOLLOWINGWORD", cosine(mV, aV));
                }
            }
            float[] aggreM = new float[dim];
            float[] aggreA = new float[dim];
            for (CoreLabel cl : m.originalSpan) {
                float[] v = dict.vectors.get(cl.word().toLowerCase());
                if (v == null)
                    continue;
                ArrayMath.pairwiseAddInPlace(aggreM, v);
            }
            for (CoreLabel cl : candidate.originalSpan) {
                float[] v = dict.vectors.get(cl.word().toLowerCase());
                if (v == null)
                    continue;
                ArrayMath.pairwiseAddInPlace(aggreA, v);
            }
            if (ArrayMath.L2Norm(aggreM) != 0 && ArrayMath.L2Norm(aggreA) != 0) {
                features.incrementCount("WORDVECTOR-AGGREGATE-DIFF", cosine(aggreM, aggreA));
            }
            int cnt = 0;
            double dist = 0;
            for (CoreLabel mcl : m.originalSpan) {
                for (CoreLabel acl : candidate.originalSpan) {
                    mV = dict.vectors.get(mcl.word().toLowerCase());
                    aV = dict.vectors.get(acl.word().toLowerCase());
                    if (mV == null || aV == null)
                        continue;
                    cnt++;
                    dist += cosine(mV, aV);
                }
            }
            features.incrementCount("WORDVECTOR-AVG-DIFF", dist / cnt);
        }
        return new RVFDatum<>(features, label);
    } catch (Exception e) {
        log.info("Datum Extraction failed in Sieve.java while processing document: " + document.docInfo.get("DOC_ID") + " part: " + document.docInfo.get("DOC_PART"));
        throw new RuntimeException(e);
    }
}
Also used : ArrayList(java.util.ArrayList) Gender(edu.stanford.nlp.coref.data.Dictionaries.Gender) Number(edu.stanford.nlp.coref.data.Dictionaries.Number) CorefChain(edu.stanford.nlp.coref.data.CorefChain) Mention(edu.stanford.nlp.coref.data.Mention) Tree(edu.stanford.nlp.trees.Tree) RVFDatum(edu.stanford.nlp.ling.RVFDatum) Animacy(edu.stanford.nlp.coref.data.Dictionaries.Animacy) MentionType(edu.stanford.nlp.coref.data.Dictionaries.MentionType) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation)

Example 5 with CorefCluster

Use of edu.stanford.nlp.coref.data.CorefCluster in project CoreNLP by stanfordnlp.

In the class HybridCorefPrinter, the method printErrorLogDcoref:

/**
 * Builds a human-readable error-log entry for a single coreference decision that a
 * resolver got wrong, for offline debugging of the hybrid coref system.
 *
 * <p>The log contains: the resolver type, the document id, every sentence from the
 * start of the document up to the mention's sentence (rendered twice: once with gold
 * cluster ids, once with mention ids), details of the mention itself, its context
 * parse tree, and one line per candidate antecedent (in dcoref sieve ordering)
 * showing whether the oracle considers it coreferent and which deterministic-coref
 * sieve, if any, would have linked it.
 *
 * @param m             the mention whose resolution is being inspected
 * @param found         the antecedent the resolver actually chose for {@code m}
 * @param document      the document containing both mentions and the cluster state
 * @param dict          dictionaries used by the dcoref sieves
 * @param mIdx          index of {@code m} among the mentions of its sentence
 *                      (passed through to {@code Sieve.getOrderedAntecedents};
 *                      presumably the position within the sentence — confirm at caller)
 * @param whichResolver label naming the resolver that made the decision, printed verbatim
 * @return the formatted error log, or the empty string if the decision was correct
 *         (correct decisions are not logged)
 * @throws Exception propagated from sieve construction/scoring inside the dcoref sieves
 */
public static String printErrorLogDcoref(Mention m, Mention found, Document document, Dictionaries dict, int mIdx, String whichResolver) throws Exception {
    StringBuilder sb = new StringBuilder();
    sb.append("\nERROR START-----------------------------------------------------------------------\n");
    sb.append("RESOLVER TYPE: ").append(whichResolver).append("\n");
    sb.append("DOCUMENT: " + document.docInfo.get("DOC_ID") + ", " + document.docInfo.get("DOC_PART")).append("\n");
    List<Mention> orderedAnts = new ArrayList<>();
    // Dump every sentence from the document start through m's sentence, annotated
    // with gold cluster ids (the loop counts sentDist down so sentIdx runs 0..m.sentNum).
    sb.append("\nGOLD CLUSTER ID\n");
    for (int sentDist = m.sentNum; sentDist >= 0; sentDist--) {
        int sentIdx = m.sentNum - sentDist;
        sb.append("\tSENT " + sentIdx + "\t" + sentenceStringWithMention(sentIdx, document, true, true)).append("\n");
    }
    // Same sentences again, this time annotated with predicted mention ids.
    sb.append("\nMENTION ID\n");
    for (int sentDist = m.sentNum; sentDist >= 0; sentDist--) {
        int sentIdx = m.sentNum - sentDist;
        sb.append("\tSENT " + sentIdx + "\t" + sentenceStringWithMention(sentIdx, document, false, false)).append("\n");
    }
    // Collect candidate antecedents in dcoref sieve ordering: nearest sentence first
    // (sentDist counts up from 0), with ordering within a sentence delegated to the sieve.
    for (int sentDist = 0; sentDist <= m.sentNum; sentDist++) {
        int sentIdx = m.sentNum - sentDist;
        orderedAnts.addAll(Sieve.getOrderedAntecedents(m, sentIdx, mIdx, document.predictedMentions, dict));
    }
    // Map each antecedent's mentionID to its rank in the dcoref ordering, so the
    // per-antecedent log lines below can show that rank.
    Map<Integer, Integer> orders = Generics.newHashMap();
    for (int i = 0; i < orderedAnts.size(); i++) {
        Mention ant = orderedAnts.get(i);
        orders.put(ant.mentionID, i);
    }
    CorefCluster mC = document.corefClusters.get(m.corefClusterID);
    boolean isFirstMention = isFirstMention(m, document);
    // we're printing only mentions that found coref antecedent
    // (hence this flag is constant true; it exists only so the header line below
    // reads uniformly across the different error-log printers)
    boolean foundCorefAnt = true;
    boolean correctDecision = document.isCoref(m, found);
    // Correct decisions are not errors: emit nothing.
    if (correctDecision)
        return "";
    sb.append("\nMENTION: " + m.spanToString() + " (" + m.mentionID + ")\tperson: " + m.person + "\tsingleton? " + (!m.hasTwin) + "\t\tisFirstMention? " + isFirstMention + "\t\tfoundAnt? " + foundCorefAnt + "\t\tcorrectDecision? " + correctDecision);
    sb.append("\n\ttype: " + m.mentionType + "\tHeadword: " + m.headWord.word() + "\tNEtype: " + m.nerString + "\tnumber: " + m.number + "\tgender: " + m.gender + "\tanimacy: " + m.animacy).append("\n");
    if (m.contextParseTree != null)
        sb.append(m.contextParseTree.pennString());
    sb.append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n");
    // One line per candidate antecedent: oracle verdict, the first dcoref sieve
    // (in precedence order) that would link it, its rank, and its span/id.
    for (Mention ant : orderedAnts) {
        int antID = ant.mentionID;
        CorefCluster aC = document.corefClusters.get(ant.corefClusterID);
        boolean oracle = Sieve.isReallyCoref(document, m.mentionID, antID);
        int order = orders.get(antID);
        String oracleStr = (oracle) ? "coref   " : "notcoref";
        // String dcorefStr = (dcoref)? "coref   " : "notcoref";
        String dcorefStr = "notcoref";
        // NOTE: the chain below is a single if/else-if cascade — the bare "else" on
        // the next line pairs with the following "if" (the chineseHeadMatch branch
        // was commented out of the middle of the cascade). Only the first sieve
        // whose coreferent(...) returns true labels the pair.
        if (dcorefSpeaker.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-speaker";
        else // else if(dcorefChineseHeadMatch.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-chineseHeadMatch";
        if (dcorefDiscourse.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-discourse";
        else if (dcorefExactString.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-exactString";
        else if (dcorefRelaxedExactString.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-relaxedExact";
        else if (dcorefPreciseConstructs.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-preciseConstruct";
        else if (dcorefHead1.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-head1";
        else if (dcorefHead2.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-head2";
        else if (dcorefHead3.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-head3";
        else if (dcorefHead4.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-head4";
        else if (dcorefRelaxedHead.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-relaxedHead";
        else if (dcorefPronounSieve.coreferent(document, mC, aC, m, ant, dict, null))
            dcorefStr = "coref-pronounSieve";
        // Append the antecedent's rank in the dcoref ordering computed above.
        dcorefStr += "\t" + String.valueOf(order);
        sb.append("\t\t" + oracleStr + "\t" + dcorefStr + "\t\t" + ant.spanToString() + " (" + ant.mentionID + ")\n");
    }
    sb.append("ERROR END -----------------------------------------------------------------------\n");
    return sb.toString();
}
Also used : CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) ArrayList(java.util.ArrayList)

Aggregations

CorefCluster (edu.stanford.nlp.coref.data.CorefCluster)15 Mention (edu.stanford.nlp.coref.data.Mention)12 ArrayList (java.util.ArrayList)6 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)4 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 HashMap (java.util.HashMap)4 Map (java.util.Map)4 CorefChain (edu.stanford.nlp.coref.data.CorefChain)3 Document (edu.stanford.nlp.coref.data.Document)3 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)3 Pair (edu.stanford.nlp.util.Pair)3 List (java.util.List)3 Dictionaries (edu.stanford.nlp.coref.data.Dictionaries)2 DocumentMaker (edu.stanford.nlp.coref.data.DocumentMaker)2 SpeakerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation)2 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)2 Tree (edu.stanford.nlp.trees.Tree)2 CoreMap (edu.stanford.nlp.util.CoreMap)2 IOException (java.io.IOException)2 HashSet (java.util.HashSet)2