Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
In the class SingletonPredictor, the method generateFeatureVectors:
/**
 * Generate the training features from the CoNLL input file.
 *
 * @return Dataset of feature vectors
 * @throws Exception if the input documents cannot be read or processed
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
  GeneralDataset<String, String> dataset = new Dataset<>();
  Dictionaries dict = new Dictionaries(props);
  DocumentMaker docMaker = new DocumentMaker(props, dict);
  Document document;
  while ((document = docMaker.nextDoc()) != null) {
    setTokenIndices(document);
    Map<Integer, CorefCluster> entities = document.goldCorefClusters;
    // Generate features for coreferent mentions with class label 1
    for (CorefCluster entity : entities.values()) {
      for (Mention mention : entity.getCorefMentions()) {
        // Ignore verbal mentions
        if (mention.headWord.tag().startsWith("V"))
          continue;
        IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
        if (head == null)
          continue;
        ArrayList<String> feats = mention.getSingletonFeatures(dict);
        dataset.add(new BasicDatum<>(feats, "1"));
      }
    }
    // Generate features for singletons with class label 0
    ArrayList<CoreLabel> gold_heads = new ArrayList<>();
    for (Mention gold_men : document.goldMentionsByID.values()) {
      gold_heads.add(gold_men.headWord);
    }
    for (Mention predicted_men : document.predictedMentionsByID.values()) {
      SemanticGraph dep = predicted_men.enhancedDependency;
      IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
      if (head == null || !dep.vertexSet().contains(head))
        continue;
      // Ignore verbal mentions
      if (predicted_men.headWord.tag().startsWith("V"))
        continue;
      // If the mention's head appears among the gold mentions, it is not a singleton, so skip it
      if (gold_heads.contains(predicted_men.headWord))
        continue;
      dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
    }
  }
  dataset.summaryStatistics();
  return dataset;
}
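In SingletonPredictor, the dataset built here feeds a binary logistic regression. A minimal sketch of that training step, assuming it runs inside SingletonPredictor where generateFeatureVectors is visible; the serialization path is a placeholder, not the class's actual default:

// Sketch: train a singleton classifier on the feature vectors built above.
// LogisticClassifierFactory is CoreNLP's logistic regression trainer; the
// labels are the "1" (coreferent) and "0" (singleton) classes assigned above.
Properties props = StringUtils.argsToProperties(args);
GeneralDataset<String, String> data = generateFeatureVectors(props);
LogisticClassifierFactory<String, String> factory = new LogisticClassifierFactory<>();
LogisticClassifier<String, String> classifier = factory.trainClassifier(data);
// Serialize for later use (the filename is illustrative).
IOUtils.writeObjectToFile(classifier, "singleton.predictor.ser");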
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
In the class FeatureExtractor, the method getFeatures:
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
  Counter<String> features = new ClassicCounter<>();
  // type features
  features.incrementCount("mention-type=" + m.mentionType);
  features.incrementCount("gender=" + m.gender);
  features.incrementCount("person-fine=" + m.person);
  features.incrementCount("head-ne-type=" + m.nerString);
  List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
  for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
    if (e.getKey() < singletonFeatures.size()) {
      features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
    }
  }
  // length and location features
  addNumeric(features, "mention-length", m.spanToString().length());
  addNumeric(features, "mention-words", m.originalSpan.size());
  addNumeric(features, "sentence-words", m.sentenceWords.size());
  features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
  features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
  features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
  // lexical features
  CoreLabel firstWord = firstWord(m);
  CoreLabel lastWord = lastWord(m);
  CoreLabel headWord = headWord(m);
  CoreLabel prevWord = prevWord(m);
  CoreLabel nextWord = nextWord(m);
  CoreLabel prevprevWord = prevprevWord(m);
  CoreLabel nextnextWord = nextnextWord(m);
  String headPOS = getPOS(headWord);
  String firstPOS = getPOS(firstWord);
  String lastPOS = getPOS(lastWord);
  String prevPOS = getPOS(prevWord);
  String nextPOS = getPOS(nextWord);
  String prevprevPOS = getPOS(prevprevWord);
  String nextnextPOS = getPOS(nextnextWord);
  features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
  features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
  features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
  features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
  features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
  features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
  features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
  features.incrementCount("next-pos=" + nextPOS);
  features.incrementCount("prev-pos=" + prevPOS);
  features.incrementCount("first-pos=" + firstPOS);
  features.incrementCount("last-pos=" + lastPOS);
  features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
  features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
  addDependencyFeatures(features, "parent", getDependencyParent(m), true);
  addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
  addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
  // syntax features
  IndexedWord w = m.headIndexedWord;
  String depPath = "";
  int depth = 0;
  while (w != null) {
    SemanticGraphEdge e = getDependencyParent(m, w);
    depth++;
    if (depth <= 3 && e != null) {
      depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
      features.incrementCount("dep-path=" + depPath);
      w = e.getSource();
    } else {
      w = null;
    }
  }
  if (useConstituencyParse) {
    int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
    int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
    if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
      features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
      features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
    } else {
      features.incrementCount("undetermined-embedding-level");
    }
    features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
    String syntaxPath = "";
    Tree tree = m.contextParseTree;
    Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
    depth = 0;
    for (Tree node : tree.pathNodeToNode(head, tree)) {
      syntaxPath += node.value() + "-";
      features.incrementCount("syntax-path=" + syntaxPath);
      depth++;
      if (depth >= 4 || node.value().equals("S")) {
        break;
      }
    }
  }
  // mention containment features
  addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
  addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
  // features from dcoref rules
  addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
  addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
  addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
  addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
  addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
  if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
    features.incrementCount("generic-you");
  }
  return features;
}
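The helpers used above (addFeature, addNumeric, bin) are private methods of FeatureExtractor that this snippet omits. A plausible reconstruction, inferred only from how they are called here; the real implementations, and in particular the bin cutoffs, may differ:

// Plausible reconstructions of the omitted private helpers; details such as
// the bin boundaries are assumptions, not the verbatim FeatureExtractor code.
private static void addFeature(Counter<String> features, String name, boolean value) {
  if (value) {
    features.incrementCount(name);  // binary indicator feature
  }
}

private static void addNumeric(Counter<String> features, String name, int value) {
  // Record both a binned indicator and the raw count as a real-valued feature.
  features.incrementCount(name + "=" + bin(value));
  features.incrementCount(name, value);
}

private static String bin(int value) {
  // Collapse large counts into coarse buckets so indicator features stay dense.
  if (value <= 5) return String.valueOf(value);
  if (value <= 10) return "6-10";
  return ">10";
}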
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
In the class HybridCorefPrinter, the method linkDistanceAnalysis:
public static void linkDistanceAnalysis(String[] args) throws Exception {
  Properties props = StringUtils.argsToProperties(args);
  HybridCorefSystem cs = new HybridCorefSystem(props);
  cs.docMaker.resetDocs();
  Counter<Integer> proper = new ClassicCounter<>();
  Counter<Integer> common = new ClassicCounter<>();
  Counter<Integer> pronoun = new ClassicCounter<>();
  Counter<Integer> list = new ClassicCounter<>();
  while (true) {
    Document document = cs.docMaker.nextDoc();
    if (document == null)
      break;
    for (int sentIdx = 0; sentIdx < document.predictedMentions.size(); sentIdx++) {
      List<Mention> predictedInSent = document.predictedMentions.get(sentIdx);
      for (int mIdx = 0; mIdx < predictedInSent.size(); mIdx++) {
        Mention m = predictedInSent.get(mIdx);
        loop: for (int distance = 0; distance <= sentIdx; distance++) {
          List<Mention> candidates = Sieve.getOrderedAntecedents(m, sentIdx - distance, mIdx, document.predictedMentions, cs.dictionaries);
          for (Mention candidate : candidates) {
            if (candidate == m)
              continue;
            // ignore cataphora
            if (distance == 0 && m.appearEarlierThan(candidate))
              continue;
            if (candidate.goldCorefClusterID == m.goldCorefClusterID) {
              switch (m.mentionType) {
                case NOMINAL:
                  if (candidate.mentionType == MentionType.NOMINAL || candidate.mentionType == MentionType.PROPER) {
                    common.incrementCount(distance);
                    break loop;
                  }
                  break;
                case PROPER:
                  if (candidate.mentionType == MentionType.PROPER) {
                    proper.incrementCount(distance);
                    break loop;
                  }
                  break;
                case PRONOMINAL:
                  pronoun.incrementCount(distance);
                  break loop;
                case LIST:
                  if (candidate.mentionType == MentionType.LIST) {
                    list.incrementCount(distance);
                    break loop;
                  }
                  break;
                default:
                  break;
              }
            }
          }
        }
      }
    }
  }
  System.out.println("PROPER -------------------------------------------");
  Counters.printCounterSortedByKeys(proper);
  System.out.println("COMMON -------------------------------------------");
  Counters.printCounterSortedByKeys(common);
  System.out.println("PRONOUN -------------------------------------------");
  Counters.printCounterSortedByKeys(pronoun);
  System.out.println("LIST -------------------------------------------");
  Counters.printCounterSortedByKeys(list);
  log.info();
}
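A hedged usage sketch: since linkDistanceAnalysis is a static entry point, it can be driven directly from a main method. The properties file path below is a placeholder, not a verified resource in the repository:

public static void main(String[] args) throws Exception {
  // StringUtils.argsToProperties (called inside linkDistanceAnalysis) expands
  // "-props <file>" into the full property set used by HybridCorefSystem.
  linkDistanceAnalysis(new String[] { "-props", "path/to/coref.properties" });
}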