Example 6 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class SingletonPredictor, method generateFeatureVectors:

/**
   * Generate the training features from the CoNLL input file.
   * @return Dataset of feature vectors
   * @throws Exception if the CoNLL input cannot be read or processed
   */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
    GeneralDataset<String, String> dataset = new Dataset<>();
    Dictionaries dict = new Dictionaries(props);
    DocumentMaker docMaker = new DocumentMaker(props, dict);
    Document document;
    while ((document = docMaker.nextDoc()) != null) {
        setTokenIndices(document);
        Map<Integer, CorefCluster> entities = document.goldCorefClusters;
        // Generate features for coreferent mentions with class label 1
        for (CorefCluster entity : entities.values()) {
            for (Mention mention : entity.getCorefMentions()) {
                // Ignore verbal mentions
                if (mention.headWord.tag().startsWith("V"))
                    continue;
                IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
                if (head == null)
                    continue;
                ArrayList<String> feats = mention.getSingletonFeatures(dict);
                dataset.add(new BasicDatum<>(feats, "1"));
            }
        }
        // Generate features for singletons with class label 0
        ArrayList<CoreLabel> gold_heads = new ArrayList<>();
        for (Mention gold_men : document.goldMentionsByID.values()) {
            gold_heads.add(gold_men.headWord);
        }
        for (Mention predicted_men : document.predictedMentionsByID.values()) {
            SemanticGraph dep = predicted_men.enhancedDependency;
            IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
            if (head == null || !dep.vertexSet().contains(head))
                continue;
            // Ignore verbal mentions
            if (predicted_men.headWord.tag().startsWith("V"))
                continue;
            // If the mention's head matches a gold mention head, it is not a singleton, so skip it
            if (gold_heads.contains(predicted_men.headWord))
                continue;
            dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
        }
    }
    dataset.summaryStatistics();
    return dataset;
}
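
The dataset built above is CoreNLP's standard Dataset/BasicDatum pairing and can be handed straight to a classifier factory. Below is a minimal, self-contained sketch of that pattern with invented feature strings; generateFeatureVectors itself is private to SingletonPredictor, so this builds its own toy data rather than calling it.

import java.util.Arrays;

import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.classify.LogisticClassifierFactory;
import edu.stanford.nlp.ling.BasicDatum;

public class SingletonDatasetSketch {
    public static void main(String[] args) {
        GeneralDataset<String, String> dataset = new Dataset<>();
        // Toy datums in the same shape as above: string-valued features,
        // label "1" for coreferent mentions, "0" for singletons
        dataset.add(new BasicDatum<>(Arrays.asList("pos=NN", "ne=PERSON"), "1"));
        dataset.add(new BasicDatum<>(Arrays.asList("pos=PRP", "ne=O"), "0"));
        dataset.summaryStatistics();
        // Train the kind of binary logistic classifier SingletonPredictor builds
        LogisticClassifier<String, String> classifier =
            new LogisticClassifierFactory<String, String>().trainClassifier(dataset);
        System.out.println(classifier.classOf(new BasicDatum<>(Arrays.asList("pos=NN", "ne=PERSON"))));
    }
}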
Also used: Dictionaries (edu.stanford.nlp.coref.data.Dictionaries), GeneralDataset (edu.stanford.nlp.classify.GeneralDataset), Dataset (edu.stanford.nlp.classify.Dataset), ArrayList (java.util.ArrayList), Document (edu.stanford.nlp.coref.data.Document), CoreLabel (edu.stanford.nlp.ling.CoreLabel), DocumentMaker (edu.stanford.nlp.coref.data.DocumentMaker), CorefCluster (edu.stanford.nlp.coref.data.CorefCluster), Mention (edu.stanford.nlp.coref.data.Mention), SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph), IndexedWord (edu.stanford.nlp.ling.IndexedWord)

Example 7 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class FeatureExtractor, method getFeatures:

private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
    Counter<String> features = new ClassicCounter<>();
    // type features
    features.incrementCount("mention-type=" + m.mentionType);
    features.incrementCount("gender=" + m.gender);
    features.incrementCount("person-fine=" + m.person);
    features.incrementCount("head-ne-type=" + m.nerString);
    List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
        }
    }
    // length and location features
    addNumeric(features, "mention-length", m.spanToString().length());
    addNumeric(features, "mention-words", m.originalSpan.size());
    addNumeric(features, "sentence-words", m.sentenceWords.size());
    features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
    features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
    features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
    // lexical features
    CoreLabel firstWord = firstWord(m);
    CoreLabel lastWord = lastWord(m);
    CoreLabel headWord = headWord(m);
    CoreLabel prevWord = prevWord(m);
    CoreLabel nextWord = nextWord(m);
    CoreLabel prevprevWord = prevprevWord(m);
    CoreLabel nextnextWord = nextnextWord(m);
    String headPOS = getPOS(headWord);
    String firstPOS = getPOS(firstWord);
    String lastPOS = getPOS(lastWord);
    String prevPOS = getPOS(prevWord);
    String nextPOS = getPOS(nextWord);
    String prevprevPOS = getPOS(prevprevWord);
    String nextnextPOS = getPOS(nextnextWord);
    features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
    features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
    features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
    features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
    features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
    features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.incrementCount("next-pos=" + nextPOS);
    features.incrementCount("prev-pos=" + prevPOS);
    features.incrementCount("first-pos=" + firstPOS);
    features.incrementCount("last-pos=" + lastPOS);
    features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    addDependencyFeatures(features, "parent", getDependencyParent(m), true);
    addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
    // syntax features
    IndexedWord w = m.headIndexedWord;
    String depPath = "";
    int depth = 0;
    // Walk up to three dependency arcs above the mention head, firing a
    // feature for each prefix of the relation path
    while (w != null) {
        SemanticGraphEdge e = getDependencyParent(m, w);
        depth++;
        if (depth <= 3 && e != null) {
            depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
            features.incrementCount("dep-path=" + depPath);
            w = e.getSource();
        } else {
            w = null;
        }
    }
    if (useConstituencyParse) {
        int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
        int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
        if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
            features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
            features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
        } else {
            features.incrementCount("undetermined-embedding-level");
        }
        features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
        // Record prefixes of the constituency path upward from the head's
        // preterminal, stopping after four nodes or at the first S node
        String syntaxPath = "";
        Tree tree = m.contextParseTree;
        Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
        depth = 0;
        for (Tree node : tree.pathNodeToNode(head, tree)) {
            syntaxPath += node.value() + "-";
            features.incrementCount("syntax-path=" + syntaxPath);
            depth++;
            if (depth >= 4 || node.value().equals("S")) {
                break;
            }
        }
    }
    // mention containment features
    addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
    addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
    // features from dcoref rules
    addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
    addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
    addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
    addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
    addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
    if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
        features.incrementCount("generic-you");
    }
    return features;
}
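
The returned Counter is an ordinary ClassicCounter, so the resulting feature vector can be inspected with the same stats utilities the surrounding code uses. A small self-contained sketch with invented feature names:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class FeatureCounterSketch {
    public static void main(String[] args) {
        Counter<String> features = new ClassicCounter<>();
        // Indicator features are incremented by 1.0 ...
        features.incrementCount("mention-type=PRONOMINAL");
        features.incrementCount("head-ne-type=PERSON");
        // ... while real-valued features carry an explicit value
        features.incrementCount("sentence-position", 0.25);
        // Prints each feature with its accumulated count, sorted by key
        Counters.printCounterSortedByKeys(features);
    }
}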
Also used: SpeakerAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation), Tree (edu.stanford.nlp.trees.Tree), HashMap (java.util.HashMap), Random (java.util.Random), Dictionaries (edu.stanford.nlp.coref.data.Dictionaries), ArrayList (java.util.ArrayList), HashSet (java.util.HashSet), Number (edu.stanford.nlp.coref.data.Dictionaries.Number), CorefCluster (edu.stanford.nlp.coref.data.CorefCluster), Mention (edu.stanford.nlp.coref.data.Mention), RuleBasedCorefMentionFinder (edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder), Counter (edu.stanford.nlp.stats.Counter), Map (java.util.Map), Pair (edu.stanford.nlp.util.Pair), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), CorefRules (edu.stanford.nlp.coref.CorefRules), IndexedWord (edu.stanford.nlp.ling.IndexedWord), CoreLabel (edu.stanford.nlp.ling.CoreLabel), Properties (java.util.Properties), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge), Iterator (java.util.Iterator), IOUtils (edu.stanford.nlp.io.IOUtils), DocType (edu.stanford.nlp.coref.data.Document.DocType), Set (java.util.Set), Person (edu.stanford.nlp.coref.data.Dictionaries.Person), List (java.util.List), MentionType (edu.stanford.nlp.coref.data.Dictionaries.MentionType), StringUtils (edu.stanford.nlp.util.StringUtils), CorefProperties (edu.stanford.nlp.coref.CorefProperties), Document (edu.stanford.nlp.coref.data.Document), CorefUtils (edu.stanford.nlp.coref.CorefUtils)

Example 8 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class HybridCorefPrinter, method linkDistanceAnalysis:

public static void linkDistanceAnalysis(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    HybridCorefSystem cs = new HybridCorefSystem(props);
    cs.docMaker.resetDocs();
    Counter<Integer> proper = new ClassicCounter<>();
    Counter<Integer> common = new ClassicCounter<>();
    Counter<Integer> pronoun = new ClassicCounter<>();
    Counter<Integer> list = new ClassicCounter<>();
    while (true) {
        Document document = cs.docMaker.nextDoc();
        if (document == null)
            break;
        for (int sentIdx = 0; sentIdx < document.predictedMentions.size(); sentIdx++) {
            List<Mention> predictedInSent = document.predictedMentions.get(sentIdx);
            for (int mIdx = 0; mIdx < predictedInSent.size(); mIdx++) {
                Mention m = predictedInSent.get(mIdx);
                // Search candidate antecedents sentence by sentence, from the current sentence outward
                loop: for (int distance = 0; distance <= sentIdx; distance++) {
                    List<Mention> candidates = Sieve.getOrderedAntecedents(m, sentIdx - distance, mIdx, document.predictedMentions, cs.dictionaries);
                    for (Mention candidate : candidates) {
                        if (candidate == m)
                            continue;
                        // ignore cataphora
                        if (distance == 0 && m.appearEarlierThan(candidate))
                            continue;
                        if (candidate.goldCorefClusterID == m.goldCorefClusterID) {
                            switch(m.mentionType) {
                                case NOMINAL:
                                    if (candidate.mentionType == MentionType.NOMINAL || candidate.mentionType == MentionType.PROPER) {
                                        common.incrementCount(distance);
                                        break loop;
                                    }
                                    break;
                                case PROPER:
                                    if (candidate.mentionType == MentionType.PROPER) {
                                        proper.incrementCount(distance);
                                        break loop;
                                    }
                                    break;
                                case PRONOMINAL:
                                    pronoun.incrementCount(distance);
                                    break loop;
                                case LIST:
                                    if (candidate.mentionType == MentionType.LIST) {
                                        list.incrementCount(distance);
                                        break loop;
                                    }
                                    break;
                                default:
                                    break;
                            }
                        }
                    }
                }
            }
        }
    }
    System.out.println("PROPER -------------------------------------------");
    Counters.printCounterSortedByKeys(proper);
    System.out.println("COMMON -------------------------------------------");
    Counters.printCounterSortedByKeys(common);
    System.out.println("PRONOUN -------------------------------------------");
    Counters.printCounterSortedByKeys(pronoun);
    System.out.println("LIST -------------------------------------------");
    Counters.printCounterSortedByKeys(list);
    log.info();
}
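
Since linkDistanceAnalysis is public and static, the usual entry point is a tiny launcher that forwards command-line properties (for example, a -props file pointing at CoNLL data) straight through; the method parses them itself with StringUtils.argsToProperties. A hedged sketch (the import path is an assumption based on the CoreNLP hybrid coref package):

import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter;  // package assumed

public class LinkDistanceLauncher {
    public static void main(String[] args) throws Exception {
        // e.g. java LinkDistanceLauncher -props my-coref.properties
        // The four counters printed at the end map sentence distance
        // (0 = same sentence) to how often the nearest gold-coreferent
        // antecedent of each mention type was found at that distance.
        HybridCorefPrinter.linkDistanceAnalysis(args);
    }
}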
Also used: Mention (edu.stanford.nlp.coref.data.Mention), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), ArrayList (java.util.ArrayList), List (java.util.List), Properties (java.util.Properties), Document (edu.stanford.nlp.coref.data.Document)

Aggregations

Document (edu.stanford.nlp.coref.data.Document): 8
CorefCluster (edu.stanford.nlp.coref.data.CorefCluster): 3
Mention (edu.stanford.nlp.coref.data.Mention): 3
ArrayList (java.util.ArrayList): 3
CorefChain (edu.stanford.nlp.coref.data.CorefChain): 2
Dictionaries (edu.stanford.nlp.coref.data.Dictionaries): 2
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 2
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 2
IndexedWord (edu.stanford.nlp.ling.IndexedWord): 2
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2
Pair (edu.stanford.nlp.util.Pair): 2
FileOutputStream (java.io.FileOutputStream): 2
PrintWriter (java.io.PrintWriter): 2
Logger (java.util.logging.Logger): 2
Dataset (edu.stanford.nlp.classify.Dataset): 1
GeneralDataset (edu.stanford.nlp.classify.GeneralDataset): 1
CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations): 1
CorefProperties (edu.stanford.nlp.coref.CorefProperties): 1
CorefRules (edu.stanford.nlp.coref.CorefRules): 1
CorefUtils (edu.stanford.nlp.coref.CorefUtils): 1