Search in sources:

Example 1 with Dictionaries

use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.

The class FastNeuralCorefDataExporter, method main.

/**
 * Entry point: exports coreference training data for the fast neural model
 * over the train/dev/test splits of the CoNLL-2012 corpus, sharing one
 * feature {@code Compressor} across splits so feature indices stay consistent.
 *
 * @param args standard property-style command-line arguments; every property
 *             set below is only a default and can be overridden on the command line
 * @throws Exception if reading the corpus or writing the exported data fails
 */
public static void main(String[] args) throws Exception {
    Properties props = StringUtils.argsToProperties(args);
    // Defaults only — putIfAbsent preserves values the user supplied in args.
    // (The original setProperty calls silently clobbered user-supplied values.)
    props.putIfAbsent("coref.maxMentionDistance", "50");
    props.putIfAbsent("coref.maxMentionDistanceWithStringMatch", "1000");
    props.putIfAbsent("coref.conllOutputPath", "/Users/kevinclark/Programming/research/coref/conll-2012/output");
    props.putIfAbsent("coref.data", "/Users/kevinclark/Programming/research/coref/conll-2012");
    props.putIfAbsent("coref.scorer", "/Users/kevinclark/Programming/research/coref/conll-2012/scorer/v8.01/scorer.pl");
    Dictionaries dictionaries = new Dictionaries(props);
    // Output layout: raw feature files, gold cluster files, and the compressor
    // all live under one output root, overridable via coref.exportPath.
    String outputPath = props.getProperty("coref.exportPath", "/Users/kevinclark/Programming/research/coref/data");
    String dataPath = outputPath + "/raw/";
    String goldClusterPath = outputPath + "/gold/";
    String compressorPath = outputPath + "/";
    IOUtils.ensureDir(new File(outputPath));
    IOUtils.ensureDir(new File(dataPath));
    IOUtils.ensureDir(new File(goldClusterPath));
    IOUtils.ensureDir(new File(compressorPath));
    // One shared compressor so the same feature gets the same index in every split.
    Compressor<String> compressor = new Compressor<String>();
    for (Dataset dataset : Arrays.asList(Dataset.TRAIN, Dataset.DEV, Dataset.TEST)) {
        CorefProperties.setInput(props, dataset);
        System.out.println(CorefProperties.getInputPath(props));
        new FastNeuralCorefDataExporter(props, dictionaries, compressor, dataPath + dataset.toString().toLowerCase(), goldClusterPath + dataset.toString().toLowerCase()).run(props, dictionaries);
    }
    // compressorPath already ends in "/" — the original appended "/compression",
    // producing a doubled slash in the written path.
    writeCompressor(compressor, compressorPath + "compression");
}
Also used : Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) Dataset(edu.stanford.nlp.coref.CorefProperties.Dataset) Compressor(edu.stanford.nlp.coref.statistical.Compressor) Properties(java.util.Properties) StatisticalCorefProperties(edu.stanford.nlp.coref.statistical.StatisticalCorefProperties) CorefProperties(edu.stanford.nlp.coref.CorefProperties) File(java.io.File)

Example 2 with Dictionaries

use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.

The class StatisticalCorefTrainer, method doTraining.

/**
 * Runs the full statistical coref training pipeline: preprocesses the train
 * and dev splits, trains the pairwise classification / ranking / anaphoricity
 * models, evaluates all three on dev, and finally trains the clusterer.
 * NOTE(review): the setDataPath(...) calls are order-dependent — each one
 * redirects where subsequent preprocessing/training/testing reads and writes,
 * so the statement order here must not be changed.
 *
 * @param props coref configuration properties
 * @throws Exception if preprocessing, training, or evaluation fails
 */
public static void doTraining(Properties props) throws Exception {
    setTrainingPath(props);
    Dictionaries dictionaries = new Dictionaries(props);
    setDataPath("train");
    wordCountsFile = trainingPath + "train/word_counts.ser";
    CorefProperties.setInput(props, Dataset.TRAIN);
    preprocess(props, dictionaries, true);
    setDataPath("dev");
    CorefProperties.setInput(props, Dataset.DEV);
    preprocess(props, dictionaries, false);
    setDataPath("train");
    // Dictionaries are only needed during preprocessing; drop the reference so
    // the (large) dictionary data can be garbage-collected before training.
    dictionaries = null;
    PairwiseModel classificationModel = PairwiseModel.newBuilder(CLASSIFICATION_MODEL, MetaFeatureExtractor.newBuilder().build()).build();
    PairwiseModel rankingModel = PairwiseModel.newBuilder(RANKING_MODEL, MetaFeatureExtractor.newBuilder().build()).build();
    // Anaphoricity uses its own feature extractor and caps the training examples.
    PairwiseModel anaphoricityModel = PairwiseModel.newBuilder(ANAPHORICITY_MODEL, MetaFeatureExtractor.anaphoricityMFE()).trainingExamples(5000000).build();
    PairwiseModelTrainer.trainRanking(rankingModel);
    PairwiseModelTrainer.trainClassification(classificationModel, false);
    PairwiseModelTrainer.trainClassification(anaphoricityModel, true);
    // Evaluate the three pairwise models on the dev split before clustering.
    setDataPath("dev");
    PairwiseModelTrainer.test(classificationModel, predictionsName, false);
    PairwiseModelTrainer.test(rankingModel, predictionsName, false);
    PairwiseModelTrainer.test(anaphoricityModel, predictionsName, true);
    new Clusterer().doTraining(CLUSTERING_MODEL_NAME);
}
Also used : Dictionaries(edu.stanford.nlp.coref.data.Dictionaries)

Example 3 with Dictionaries

use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.

The class FeatureExtractor, method getFeatures.

/**
 * Builds the per-mention feature counter used by the statistical coref models.
 * Features cover the mention's type/gender/person/NER, singleton features from
 * the dictionaries, length/position, lexical context (head, first/last word,
 * two words of left/right context, plus POS n-grams), dependency and (when
 * enabled) constituency syntax, containment relative to other mentions sharing
 * the same head index, and a handful of dcoref-rule indicators.
 * NOTE(review): the literal feature-key strings are part of the trained-model
 * contract — do not change them.
 *
 * @param doc the document providing sentence counts and predicted mentions
 * @param m the mention to featurize
 * @param mentionsByHeadIndex all mentions in the document grouped by head index
 * @return a counter mapping feature names to values (mostly indicator counts)
 */
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
    Counter<String> features = new ClassicCounter<>();
    // type features
    features.incrementCount("mention-type=" + m.mentionType);
    features.incrementCount("gender=" + m.gender);
    features.incrementCount("person-fine=" + m.person);
    features.incrementCount("head-ne-type=" + m.nerString);
    List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
    // SINGLETON_FEATURES maps an index into the singleton-feature list to a
    // human-readable feature name; indices past the list's end are skipped.
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
        }
    }
    // length and location features
    addNumeric(features, "mention-length", m.spanToString().length());
    addNumeric(features, "mention-words", m.originalSpan.size());
    addNumeric(features, "sentence-words", m.sentenceWords.size());
    features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
    // Real-valued positions in [0, 1): mention order within the document and
    // sentence order within the document.
    features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
    features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
    // lexical features
    CoreLabel firstWord = firstWord(m);
    CoreLabel lastWord = lastWord(m);
    CoreLabel headWord = headWord(m);
    CoreLabel prevWord = prevWord(m);
    CoreLabel nextWord = nextWord(m);
    CoreLabel prevprevWord = prevprevWord(m);
    CoreLabel nextnextWord = nextnextWord(m);
    String headPOS = getPOS(headWord);
    String firstPOS = getPOS(firstWord);
    String lastPOS = getPOS(lastWord);
    String prevPOS = getPOS(prevWord);
    String nextPOS = getPOS(nextWord);
    String prevprevPOS = getPOS(prevprevWord);
    String nextnextPOS = getPOS(nextnextWord);
    features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
    features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
    features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
    features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
    features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
    features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.incrementCount("next-pos=" + nextPOS);
    features.incrementCount("prev-pos=" + prevPOS);
    features.incrementCount("first-pos=" + firstPOS);
    features.incrementCount("last-pos=" + lastPOS);
    features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    addDependencyFeatures(features, "parent", getDependencyParent(m), true);
    addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
    // syntax features
    IndexedWord w = m.headIndexedWord;
    String depPath = "";
    int depth = 0;
    // Walk up the dependency tree from the head, emitting one cumulative-path
    // feature per ancestor, up to 3 levels (w = null terminates the walk).
    while (w != null) {
        SemanticGraphEdge e = getDependencyParent(m, w);
        depth++;
        if (depth <= 3 && e != null) {
            depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
            features.incrementCount("dep-path=" + depPath);
            w = e.getSource();
        } else {
            w = null;
        }
    }
    if (useConstituencyParse) {
        // Embedding depth of the head within the full parse vs. the mention subtree;
        // headEmbeddingLevel returning -1 signals the level could not be determined.
        int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
        int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
        if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
            features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
            features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
        } else {
            features.incrementCount("undetermined-embedding-level");
        }
        features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
        // Cumulative path of constituent labels from just above the head word
        // toward the root, capped at 4 nodes or the first S node.
        String syntaxPath = "";
        Tree tree = m.contextParseTree;
        Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
        depth = 0;
        for (Tree node : tree.pathNodeToNode(head, tree)) {
            syntaxPath += node.value() + "-";
            features.incrementCount("syntax-path=" + syntaxPath);
            depth++;
            if (depth >= 4 || node.value().equals("S")) {
                break;
            }
        }
    }
    // mention containment features
    addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
    addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
    // features from dcoref rules
    addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
    addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
    addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
    addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
    addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
    // Conversational "you know" heuristic: second-person mentions followed by
    // "know" in non-article documents are likely generic, not referential.
    if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
        features.incrementCount("generic-you");
    }
    return features;
}
Also used : SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation) Tree(edu.stanford.nlp.trees.Tree) HashMap(java.util.HashMap) Random(java.util.Random) Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Number(edu.stanford.nlp.coref.data.Dictionaries.Number) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) RuleBasedCorefMentionFinder(edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder) Counter(edu.stanford.nlp.stats.Counter) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) CorefRules(edu.stanford.nlp.coref.CorefRules) IndexedWord(edu.stanford.nlp.ling.IndexedWord) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Properties(java.util.Properties) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) Iterator(java.util.Iterator) IOUtils(edu.stanford.nlp.io.IOUtils) DocType(edu.stanford.nlp.coref.data.Document.DocType) Set(java.util.Set) Person(edu.stanford.nlp.coref.data.Dictionaries.Person) List(java.util.List) MentionType(edu.stanford.nlp.coref.data.Dictionaries.MentionType) StringUtils(edu.stanford.nlp.util.StringUtils) CorefProperties(edu.stanford.nlp.coref.CorefProperties) Document(edu.stanford.nlp.coref.data.Document) CorefUtils(edu.stanford.nlp.coref.CorefUtils) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreLabel(edu.stanford.nlp.ling.CoreLabel) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) IndexedWord(edu.stanford.nlp.ling.IndexedWord) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with Dictionaries

use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.

The class SingletonPredictor, method generateFeatureVectors.

/**
 * Generate the training features from the CoNLL input file.
 * Mentions belonging to a gold coreference cluster are labeled "1"; predicted
 * mentions whose head word does not match any gold mention's head word are
 * treated as singletons and labeled "0". Verbal mentions and mentions whose
 * head cannot be resolved in the enhanced dependency graph are skipped.
 *
 * @param props properties locating the input corpus and dictionary data
 * @return Dataset of feature vectors
 * @throws Exception if the corpus or dictionaries cannot be read
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
    GeneralDataset<String, String> dataset = new Dataset<>();
    Dictionaries dict = new Dictionaries(props);
    DocumentMaker docMaker = new DocumentMaker(props, dict);
    Document document;
    while ((document = docMaker.nextDoc()) != null) {
        setTokenIndices(document);
        Map<Integer, CorefCluster> entities = document.goldCorefClusters;
        // Generate features for coreferent mentions with class label 1
        for (CorefCluster entity : entities.values()) {
            for (Mention mention : entity.getCorefMentions()) {
                // Ignore verbal mentions
                if (mention.headWord.tag().startsWith("V"))
                    continue;
                IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
                if (head == null)
                    continue;
                ArrayList<String> feats = mention.getSingletonFeatures(dict);
                dataset.add(new BasicDatum<>(feats, "1"));
            }
        }
        // Generate features for singletons with class label 0.
        // A hash set gives O(1) membership checks below; the original ArrayList
        // made each contains() call O(#gold mentions) per predicted mention.
        java.util.Set<CoreLabel> gold_heads = new java.util.HashSet<>();
        for (Mention gold_men : document.goldMentionsByID.values()) {
            gold_heads.add(gold_men.headWord);
        }
        for (Mention predicted_men : document.predictedMentionsByID.values()) {
            SemanticGraph dep = predicted_men.enhancedDependency;
            IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
            if (head == null || !dep.vertexSet().contains(head))
                continue;
            // Ignore verbal mentions
            if (predicted_men.headWord.tag().startsWith("V"))
                continue;
            // If the mention is in the gold set, it is not a singleton and thus ignore
            if (gold_heads.contains(predicted_men.headWord))
                continue;
            dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
        }
    }
    dataset.summaryStatistics();
    return dataset;
}
Also used : Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) GeneralDataset(edu.stanford.nlp.classify.GeneralDataset) Dataset(edu.stanford.nlp.classify.Dataset) ArrayList(java.util.ArrayList) Document(edu.stanford.nlp.coref.data.Document) CoreLabel(edu.stanford.nlp.ling.CoreLabel) DocumentMaker(edu.stanford.nlp.coref.data.DocumentMaker) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 5 with Dictionaries

use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.

The class MentionDetectionEvaluator, method main.

/**
 * Entry point: evaluates mention detection on the CoNLL training split.
 *
 * @param args {@code args[0]} must be the path to a properties file
 * @throws Exception if the properties, dictionaries, or corpus cannot be loaded
 */
public static void main(String[] args) throws Exception {
    // Fail fast with a usage message instead of the opaque
    // ArrayIndexOutOfBoundsException the original threw when run with no args.
    if (args.length < 1) {
        System.err.println("Usage: MentionDetectionEvaluator <propertiesFile>");
        System.exit(1);
    }
    Properties props = StringUtils.argsToProperties(new String[] { "-props", args[0] });
    Dictionaries dictionaries = new Dictionaries(props);
    CorefProperties.setInput(props, Dataset.TRAIN);
    new MentionDetectionEvaluator().run(props, dictionaries);
}
Also used : Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) Properties(java.util.Properties) CorefProperties(edu.stanford.nlp.coref.CorefProperties)

Aggregations

Dictionaries (edu.stanford.nlp.coref.data.Dictionaries)7 Properties (java.util.Properties)5 CorefProperties (edu.stanford.nlp.coref.CorefProperties)4 CorefCluster (edu.stanford.nlp.coref.data.CorefCluster)2 Document (edu.stanford.nlp.coref.data.Document)2 DocumentMaker (edu.stanford.nlp.coref.data.DocumentMaker)2 Mention (edu.stanford.nlp.coref.data.Mention)2 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 ArrayList (java.util.ArrayList)2 Dataset (edu.stanford.nlp.classify.Dataset)1 GeneralDataset (edu.stanford.nlp.classify.GeneralDataset)1 Dataset (edu.stanford.nlp.coref.CorefProperties.Dataset)1 CorefRules (edu.stanford.nlp.coref.CorefRules)1 CorefSystem (edu.stanford.nlp.coref.CorefSystem)1 CorefUtils (edu.stanford.nlp.coref.CorefUtils)1 MentionType (edu.stanford.nlp.coref.data.Dictionaries.MentionType)1 Number (edu.stanford.nlp.coref.data.Dictionaries.Number)1 Person (edu.stanford.nlp.coref.data.Dictionaries.Person)1 DocType (edu.stanford.nlp.coref.data.Document.DocType)1