Search in sources :

Example 1 with GeneralDataset

use of edu.stanford.nlp.classify.GeneralDataset in project CoreNLP by stanfordnlp.

the class SingletonPredictor method generateFeatureVectors.

/**
 * Generate the training features from the CoNLL input file.
 * @return Dataset of feature vectors
 * @throws Exception
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
    GeneralDataset<String, String> dataset = new Dataset<>();
    Dictionaries dict = new Dictionaries(props);
    MentionExtractor mentionExtractor = new CoNLLMentionExtractor(dict, props, new Semantics(dict));
    Document document;
    while ((document = mentionExtractor.nextDoc()) != null) {
        setTokenIndices(document);
        document.extractGoldCorefClusters();
        Map<Integer, CorefCluster> entities = document.goldCorefClusters;
        // Generate features for coreferent mentions with class label 1
        for (CorefCluster entity : entities.values()) {
            for (Mention mention : entity.getCorefMentions()) {
                // Ignore verbal mentions
                if (mention.headWord.tag().startsWith("V"))
                    continue;
                IndexedWord head = mention.dependency.getNodeByIndexSafe(mention.headWord.index());
                if (head == null)
                    continue;
                ArrayList<String> feats = mention.getSingletonFeatures(dict);
                dataset.add(new BasicDatum<>(feats, "1"));
            }
        }
        // Generate features for singletons with class label 0
        ArrayList<CoreLabel> gold_heads = new ArrayList<>();
        for (Mention gold_men : document.allGoldMentions.values()) {
            gold_heads.add(gold_men.headWord);
        }
        for (Mention predicted_men : document.allPredictedMentions.values()) {
            SemanticGraph dep = predicted_men.dependency;
            IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
            if (head == null)
                continue;
            // Ignore verbal mentions
            if (predicted_men.headWord.tag().startsWith("V"))
                continue;
            // If the mention is in the gold set, it is not a singleton and thus ignore
            if (gold_heads.contains(predicted_men.headWord))
                continue;
            dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
        }
    }
    dataset.summaryStatistics();
    return dataset;
}
Also used : Dataset(edu.stanford.nlp.classify.Dataset) GeneralDataset(edu.stanford.nlp.classify.GeneralDataset) ArrayList(java.util.ArrayList) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 2 with GeneralDataset

use of edu.stanford.nlp.classify.GeneralDataset in project CoreNLP by stanfordnlp.

the class SingletonPredictor method generateFeatureVectors.

/**
 * Generate the training features from the CoNLL input file.
 * @return Dataset of feature vectors
 * @throws Exception
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
    GeneralDataset<String, String> dataset = new Dataset<>();
    Dictionaries dict = new Dictionaries(props);
    DocumentMaker docMaker = new DocumentMaker(props, dict);
    Document document;
    while ((document = docMaker.nextDoc()) != null) {
        setTokenIndices(document);
        Map<Integer, CorefCluster> entities = document.goldCorefClusters;
        // Generate features for coreferent mentions with class label 1
        for (CorefCluster entity : entities.values()) {
            for (Mention mention : entity.getCorefMentions()) {
                // Ignore verbal mentions
                if (mention.headWord.tag().startsWith("V"))
                    continue;
                IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
                if (head == null)
                    continue;
                ArrayList<String> feats = mention.getSingletonFeatures(dict);
                dataset.add(new BasicDatum<>(feats, "1"));
            }
        }
        // Generate features for singletons with class label 0
        ArrayList<CoreLabel> gold_heads = new ArrayList<>();
        for (Mention gold_men : document.goldMentionsByID.values()) {
            gold_heads.add(gold_men.headWord);
        }
        for (Mention predicted_men : document.predictedMentionsByID.values()) {
            SemanticGraph dep = predicted_men.enhancedDependency;
            IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
            if (head == null || !dep.vertexSet().contains(head))
                continue;
            // Ignore verbal mentions
            if (predicted_men.headWord.tag().startsWith("V"))
                continue;
            // If the mention is in the gold set, it is not a singleton and thus ignore
            if (gold_heads.contains(predicted_men.headWord))
                continue;
            dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
        }
    }
    dataset.summaryStatistics();
    return dataset;
}
Also used : Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) GeneralDataset(edu.stanford.nlp.classify.GeneralDataset) Dataset(edu.stanford.nlp.classify.Dataset) ArrayList(java.util.ArrayList) Document(edu.stanford.nlp.coref.data.Document) CoreLabel(edu.stanford.nlp.ling.CoreLabel) DocumentMaker(edu.stanford.nlp.coref.data.DocumentMaker) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Aggregations

Dataset (edu.stanford.nlp.classify.Dataset)2 GeneralDataset (edu.stanford.nlp.classify.GeneralDataset)2 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 IndexedWord (edu.stanford.nlp.ling.IndexedWord)2 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)2 ArrayList (java.util.ArrayList)2 CorefCluster (edu.stanford.nlp.coref.data.CorefCluster)1 Dictionaries (edu.stanford.nlp.coref.data.Dictionaries)1 Document (edu.stanford.nlp.coref.data.Document)1 DocumentMaker (edu.stanford.nlp.coref.data.DocumentMaker)1 Mention (edu.stanford.nlp.coref.data.Mention)1