Use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.
The class FastNeuralCorefDataExporter, method main.
public static void main(String[] args) throws Exception {
  Properties props = StringUtils.argsToProperties(args);
  props.setProperty("coref.maxMentionDistance", "50");
  props.setProperty("coref.maxMentionDistanceWithStringMatch", "1000");
  props.setProperty("coref.conllOutputPath", "/Users/kevinclark/Programming/research/coref/conll-2012/output");
  props.setProperty("coref.data", "/Users/kevinclark/Programming/research/coref/conll-2012");
  props.setProperty("coref.scorer", "/Users/kevinclark/Programming/research/coref/conll-2012/scorer/v8.01/scorer.pl");
  Dictionaries dictionaries = new Dictionaries(props);
  String outputPath = "/Users/kevinclark/Programming/research/coref/data";
  String dataPath = outputPath + "/raw/";
  String goldClusterPath = outputPath + "/gold/";
  String compressorPath = outputPath + "/";
  IOUtils.ensureDir(new File(outputPath));
  IOUtils.ensureDir(new File(dataPath));
  IOUtils.ensureDir(new File(goldClusterPath));
  IOUtils.ensureDir(new File(compressorPath));
  Compressor<String> compressor = new Compressor<>();
  for (Dataset dataset : Arrays.asList(Dataset.TRAIN, Dataset.DEV, Dataset.TEST)) {
    CorefProperties.setInput(props, dataset);
    System.out.println(CorefProperties.getInputPath(props));
    new FastNeuralCorefDataExporter(props, dictionaries, compressor,
        dataPath + dataset.toString().toLowerCase(),
        goldClusterPath + dataset.toString().toLowerCase()).run(props, dictionaries);
  }
  writeCompressor(compressor, compressorPath + "/compression");
}
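The paths above are hard-coded to the original author's machine. A minimal sketch of exporting just one split to a directory of your own, reusing only the calls that appear in the method above and assumed to be run from within the same class (the /tmp/coref-export path is hypothetical):

// Sketch: export only the DEV split; the output directory is hypothetical.
Properties props = StringUtils.argsToProperties(args);
Dictionaries dictionaries = new Dictionaries(props);
Compressor<String> compressor = new Compressor<>();
String out = "/tmp/coref-export";
IOUtils.ensureDir(new File(out));
CorefProperties.setInput(props, Dataset.DEV);
new FastNeuralCorefDataExporter(props, dictionaries, compressor,
    out + "/raw-dev", out + "/gold-dev").run(props, dictionaries);
writeCompressor(compressor, out + "/compression");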
Use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.
The class StatisticalCorefTrainer, method doTraining.
public static void doTraining(Properties props) throws Exception {
  setTrainingPath(props);
  Dictionaries dictionaries = new Dictionaries(props);
  // Extract features from the training and dev splits
  setDataPath("train");
  wordCountsFile = trainingPath + "train/word_counts.ser";
  CorefProperties.setInput(props, Dataset.TRAIN);
  preprocess(props, dictionaries, true);
  setDataPath("dev");
  CorefProperties.setInput(props, Dataset.DEV);
  preprocess(props, dictionaries, false);
  setDataPath("train");
  // The dictionaries are no longer needed; release them before training
  dictionaries = null;
  // Train the three pairwise models: classification, ranking, and anaphoricity
  PairwiseModel classificationModel = PairwiseModel.newBuilder(CLASSIFICATION_MODEL, MetaFeatureExtractor.newBuilder().build()).build();
  PairwiseModel rankingModel = PairwiseModel.newBuilder(RANKING_MODEL, MetaFeatureExtractor.newBuilder().build()).build();
  PairwiseModel anaphoricityModel = PairwiseModel.newBuilder(ANAPHORICITY_MODEL, MetaFeatureExtractor.anaphoricityMFE()).trainingExamples(5000000).build();
  PairwiseModelTrainer.trainRanking(rankingModel);
  PairwiseModelTrainer.trainClassification(classificationModel, false);
  PairwiseModelTrainer.trainClassification(anaphoricityModel, true);
  // Evaluate the pairwise models on dev, then train the clustering model
  setDataPath("dev");
  PairwiseModelTrainer.test(classificationModel, predictionsName, false);
  PairwiseModelTrainer.test(rankingModel, predictionsName, false);
  PairwiseModelTrainer.test(anaphoricityModel, predictionsName, true);
  new Clusterer().doTraining(CLUSTERING_MODEL_NAME);
}
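doTraining is driven entirely by the Properties it receives, so a driver only needs to load a properties file and call it. A minimal sketch, mirroring the argsToProperties usage elsewhere on this page (the properties file name is hypothetical):

public static void main(String[] args) throws Exception {
  // coref-train.properties is a hypothetical file setting coref.data and the training paths
  Properties props = StringUtils.argsToProperties(new String[] { "-props", "coref-train.properties" });
  StatisticalCorefTrainer.doTraining(props);
}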
Use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.
The class FeatureExtractor, method getFeatures.
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
  Counter<String> features = new ClassicCounter<>();
  // type features
  features.incrementCount("mention-type=" + m.mentionType);
  features.incrementCount("gender=" + m.gender);
  features.incrementCount("person-fine=" + m.person);
  features.incrementCount("head-ne-type=" + m.nerString);
  List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
  for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
    if (e.getKey() < singletonFeatures.size()) {
      features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
    }
  }
  // length and location features
  addNumeric(features, "mention-length", m.spanToString().length());
  addNumeric(features, "mention-words", m.originalSpan.size());
  addNumeric(features, "sentence-words", m.sentenceWords.size());
  features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
  features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
  features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
  // lexical features
  CoreLabel firstWord = firstWord(m);
  CoreLabel lastWord = lastWord(m);
  CoreLabel headWord = headWord(m);
  CoreLabel prevWord = prevWord(m);
  CoreLabel nextWord = nextWord(m);
  CoreLabel prevprevWord = prevprevWord(m);
  CoreLabel nextnextWord = nextnextWord(m);
  String headPOS = getPOS(headWord);
  String firstPOS = getPOS(firstWord);
  String lastPOS = getPOS(lastWord);
  String prevPOS = getPOS(prevWord);
  String nextPOS = getPOS(nextWord);
  String prevprevPOS = getPOS(prevprevWord);
  String nextnextPOS = getPOS(nextnextWord);
  features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
  features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
  features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
  features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
  features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
  features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
  features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
  features.incrementCount("next-pos=" + nextPOS);
  features.incrementCount("prev-pos=" + prevPOS);
  features.incrementCount("first-pos=" + firstPOS);
  features.incrementCount("last-pos=" + lastPOS);
  features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
  features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
  addDependencyFeatures(features, "parent", getDependencyParent(m), true);
  addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
  addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
  // syntax features
  IndexedWord w = m.headIndexedWord;
  String depPath = "";
  int depth = 0;
  while (w != null) {
    SemanticGraphEdge e = getDependencyParent(m, w);
    depth++;
    if (depth <= 3 && e != null) {
      depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
      features.incrementCount("dep-path=" + depPath);
      w = e.getSource();
    } else {
      w = null;
    }
  }
  if (useConstituencyParse) {
    int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
    int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
    if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
      features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
      features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
    } else {
      features.incrementCount("undetermined-embedding-level");
    }
    features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
    String syntaxPath = "";
    Tree tree = m.contextParseTree;
    Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
    depth = 0;
    for (Tree node : tree.pathNodeToNode(head, tree)) {
      syntaxPath += node.value() + "-";
      features.incrementCount("syntax-path=" + syntaxPath);
      depth++;
      if (depth >= 4 || node.value().equals("S")) {
        break;
      }
    }
  }
  // mention containment features
  addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
  addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
  // features from dcoref rules
  addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
  addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
  addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
  addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
  addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
  if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
    features.incrementCount("generic-you");
  }
  return features;
}
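getFeatures follows one pattern throughout: categorical values are folded into the feature name as "key=value" indicator strings, booleans become presence features, and real-valued signals use the counter weight itself. A minimal standalone sketch of that pattern (the method and feature names here are hypothetical illustrations, not part of FeatureExtractor):

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

static Counter<String> indicatorSketch(String mentionType, int sentNum, int numSentences, boolean barePlural) {
  Counter<String> features = new ClassicCounter<>();
  // categorical value folded into the key, counted as an indicator
  features.incrementCount("mention-type=" + mentionType);
  // real-valued feature: the count itself carries the value
  features.incrementCount("sentence-position", sentNum / (double) numSentences);
  // boolean feature: present only when true
  if (barePlural) {
    features.incrementCount("bare-plural");
  }
  return features;
}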
Use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.
The class SingletonPredictor, method generateFeatureVectors.
/**
 * Generate the training features from the CoNLL input file.
 * @return Dataset of feature vectors
 * @throws Exception
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
  GeneralDataset<String, String> dataset = new Dataset<>();
  Dictionaries dict = new Dictionaries(props);
  DocumentMaker docMaker = new DocumentMaker(props, dict);
  Document document;
  while ((document = docMaker.nextDoc()) != null) {
    setTokenIndices(document);
    Map<Integer, CorefCluster> entities = document.goldCorefClusters;
    // Generate features for coreferent mentions with class label 1
    for (CorefCluster entity : entities.values()) {
      for (Mention mention : entity.getCorefMentions()) {
        // Ignore verbal mentions
        if (mention.headWord.tag().startsWith("V"))
          continue;
        IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
        if (head == null)
          continue;
        ArrayList<String> feats = mention.getSingletonFeatures(dict);
        dataset.add(new BasicDatum<>(feats, "1"));
      }
    }
    // Generate features for singletons with class label 0
    ArrayList<CoreLabel> gold_heads = new ArrayList<>();
    for (Mention gold_men : document.goldMentionsByID.values()) {
      gold_heads.add(gold_men.headWord);
    }
    for (Mention predicted_men : document.predictedMentionsByID.values()) {
      SemanticGraph dep = predicted_men.enhancedDependency;
      IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
      if (head == null || !dep.vertexSet().contains(head))
        continue;
      // Ignore verbal mentions
      if (predicted_men.headWord.tag().startsWith("V"))
        continue;
      // If the mention's head is in the gold set, it is not a singleton, so skip it
      if (gold_heads.contains(predicted_men.headWord))
        continue;
      dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
    }
  }
  dataset.summaryStatistics();
  return dataset;
}
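The returned GeneralDataset pairs each mention's singleton features with a "1" (coreferent) or "0" (singleton) label, so it can be fed directly to a classifier factory from the classify package. A minimal sketch, assuming LogisticClassifierFactory as the learner (an assumption about a reasonable choice, not a claim about what the surrounding class does next):

import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.classify.LogisticClassifierFactory;

static LogisticClassifier<String, String> trainSingletonModel(Properties props) throws Exception {
  // generateFeatureVectors labels coreferent mentions "1" and singletons "0"
  GeneralDataset<String, String> data = generateFeatureVectors(props);
  LogisticClassifierFactory<String, String> factory = new LogisticClassifierFactory<>();
  return factory.trainClassifier(data);
}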
Use of edu.stanford.nlp.coref.data.Dictionaries in project CoreNLP by stanfordnlp.
The class MentionDetectionEvaluator, method main.
public static void main(String[] args) throws Exception {
  Properties props = StringUtils.argsToProperties(new String[] { "-props", args[0] });
  Dictionaries dictionaries = new Dictionaries(props);
  CorefProperties.setInput(props, Dataset.TRAIN);
  new MentionDetectionEvaluator().run(props, dictionaries);
}
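The same evaluator can be pointed at a different split by changing the Dataset argument. A minimal sketch evaluating on DEV instead of TRAIN, reusing only the calls shown above (the properties file name is hypothetical):

// Sketch: evaluate mention detection on the DEV split.
Properties props = StringUtils.argsToProperties(new String[] { "-props", "coref.properties" });
Dictionaries dictionaries = new Dictionaries(props);
CorefProperties.setInput(props, Dataset.DEV);
new MentionDetectionEvaluator().run(props, dictionaries);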