Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
Class CorefSystem, method annotate().
public void annotate(Annotation ann) {
  Document document;
  try {
    document = docMaker.makeDocument(ann);
  } catch (Exception e) {
    throw new RuntimeException("Error making document", e);
  }
  CorefUtils.checkForInterrupt();
  corefAlgorithm.runCoref(document);
  if (removeSingletonClusters) {
    CorefUtils.removeSingletonClusters(document);
  }
  CorefUtils.checkForInterrupt();
  // Convert the internal CorefCluster objects into CorefChains and attach them to the Annotation.
  Map<Integer, CorefChain> result = Generics.newHashMap();
  for (CorefCluster c : document.corefClusters.values()) {
    result.put(c.clusterID, new CorefChain(c, document.positions));
  }
  ann.set(CorefCoreAnnotations.CorefChainAnnotation.class, result);
}
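For context, a minimal usage sketch (not part of the CoreNLP source shown above): coreference is normally reached through the StanfordCoreNLP pipeline's coref annotator, which ultimately calls annotate(Annotation) as above and stores the chains under CorefChainAnnotation. The annotator list and example sentence below are illustrative assumptions.

import java.util.Map;
import java.util.Properties;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class CorefUsageSketch {
  public static void main(String[] args) {
    // Build a pipeline whose last stage runs coreference resolution.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse,coref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation ann = new Annotation("Barack Obama was born in Hawaii. He was elected president in 2008.");
    pipeline.annotate(ann);

    // Read back the chains that CorefSystem.annotate() attached to the Annotation.
    Map<Integer, CorefChain> chains = ann.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    for (CorefChain chain : chains.values()) {
      System.out.println(chain.getChainID() + ": " + chain.getMentionsInTextualOrder());
    }
  }
}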
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
Class CorefSystem, method runOnConll().
public void runOnConll(Properties props) throws Exception {
  File f = new File(CorefProperties.conllOutputPath(props));
  if (!f.exists()) {
    f.mkdirs();
  }
  String timestamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
  String baseName = CorefProperties.conllOutputPath(props) + timestamp;
  String goldOutput = baseName + ".gold.txt";
  String beforeCorefOutput = baseName + ".predicted.txt";
  String afterCorefOutput = baseName + ".coref.predicted.txt";
  PrintWriter writerGold = new PrintWriter(new FileOutputStream(goldOutput));
  PrintWriter writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
  PrintWriter writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
  Logger logger = Logger.getLogger(CorefSystem.class.getName());
  initLogger(logger, baseName + ".log");
  logger.info(timestamp);
  logger.info(props.toString());
  // Run coreference over every CoNLL document, writing gold, pre-coref, and post-coref CoNLL output.
  (new CorefDocumentProcessor() {

    @Override
    public void process(int id, Document document) {
      writerGold.print(CorefPrinter.printConllOutput(document, true));
      writerBeforeCoref.print(CorefPrinter.printConllOutput(document, false));
      long time = System.currentTimeMillis();
      corefAlgorithm.runCoref(document);
      if (verbose) {
        Redwood.log(getName(), "Coref took " + (System.currentTimeMillis() - time) / 1000.0 + "s");
      }
      CorefUtils.removeSingletonClusters(document);
      if (verbose) {
        CorefUtils.printHumanReadableCoref(document);
      }
      if (document.filterMentionSet != null) {
        Map<Integer, CorefCluster> filteredClusters = document.corefClusters.values().stream()
            .filter(x -> CorefUtils.filterClustersWithMentionSpans(x, document.filterMentionSet))
            .collect(Collectors.toMap(x -> x.clusterID, x -> x));
        writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true, filteredClusters));
      } else {
        writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true));
      }
    }

    @Override
    public void finish() throws Exception {
    }

    @Override
    public String getName() {
      return corefAlgorithm.getClass().getName();
    }
  }).run(docMaker);
  // Score with the CoNLL scorer before and after coreference resolution.
  String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
  logger.info("Before Coref");
  CorefScorer.printScoreSummary(summary, logger, false);
  CorefScorer.printScoreSummary(summary, logger, true);
  CorefScorer.printFinalConllScore(summary, logger);
  summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
  logger.info("After Coref");
  CorefScorer.printScoreSummary(summary, logger, false);
  CorefScorer.printScoreSummary(summary, logger, true);
  CorefScorer.printFinalConllScore(summary, logger);
  writerGold.close();
  writerBeforeCoref.close();
  writerAfterCoref.close();
}
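A hedged sketch of how runOnConll might be driven from a main method. The property keys below are assumptions based on the CorefProperties accessors used above (conllOutputPath, getScorerPath); verify the exact key names against the CoreNLP version in use.

import java.util.Properties;
import edu.stanford.nlp.coref.CorefSystem;

public class RunOnConllSketch {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    // Assumed property keys: CoNLL input data, output directory, and the CoNLL scorer script.
    props.setProperty("coref.data", "/path/to/conll-2012");
    props.setProperty("coref.conllOutputPath", "/tmp/coref-conll-output/");
    props.setProperty("coref.scorer", "/path/to/reference-coreference-scorers/scorer.pl");

    CorefSystem coref = new CorefSystem(props);
    coref.runOnConll(props);
  }
}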
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
Class CorefDocumentProcessor, method run().
public default void run(DocumentMaker docMaker) throws Exception {
  Redwood.hideChannelsEverywhere("debug-mention", "debug-preprocessor", "debug-docreader", "debug-md");
  // Iterate over every document produced by the DocumentMaker, calling process() on each,
  // then finish() once the stream is exhausted.
  int docId = 0;
  Document document = docMaker.nextDoc();
  long time = System.currentTimeMillis();
  while (document != null) {
    process(docId, document);
    Redwood.log(getName(), "Processed document " + docId + " in " + (System.currentTimeMillis() - time) / 1000.0 + "s");
    time = System.currentTimeMillis();
    docId++;
    document = docMaker.nextDoc();
  }
  finish();
}
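Because run(DocumentMaker) is a default method on the interface, any implementation of process/finish/getName gets this document loop for free. A minimal illustrative implementation follows; the mention-counting logic and the pre-configured docMaker variable are assumptions for the sketch, not CoreNLP code.

// Count predicted mentions across a corpus; assumes a DocumentMaker named docMaker is already configured.
CorefDocumentProcessor counter = new CorefDocumentProcessor() {
  int totalMentions = 0;

  @Override
  public void process(int id, Document document) {
    // predictedMentions holds one list of mentions per sentence of the coref Document.
    totalMentions += document.predictedMentions.stream().mapToInt(List::size).sum();
  }

  @Override
  public void finish() throws Exception {
    System.out.println("Total predicted mentions: " + totalMentions);
  }

  @Override
  public String getName() {
    return "mention-counter";
  }
};
counter.run(docMaker);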
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
Class FeatureExtractor, method getFeatures().
private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
  Counter<String> features = new ClassicCounter<>();
  // type features
  features.incrementCount("mention-type=" + m.mentionType);
  features.incrementCount("gender=" + m.gender);
  features.incrementCount("person-fine=" + m.person);
  features.incrementCount("head-ne-type=" + m.nerString);
  List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
  for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
    if (e.getKey() < singletonFeatures.size()) {
      features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
    }
  }
  // length and location features
  addNumeric(features, "mention-length", m.spanToString().length());
  addNumeric(features, "mention-words", m.originalSpan.size());
  addNumeric(features, "sentence-words", m.sentenceWords.size());
  features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
  features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
  features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
  // lexical features
  CoreLabel firstWord = firstWord(m);
  CoreLabel lastWord = lastWord(m);
  CoreLabel headWord = headWord(m);
  CoreLabel prevWord = prevWord(m);
  CoreLabel nextWord = nextWord(m);
  CoreLabel prevprevWord = prevprevWord(m);
  CoreLabel nextnextWord = nextnextWord(m);
  String headPOS = getPOS(headWord);
  String firstPOS = getPOS(firstWord);
  String lastPOS = getPOS(lastWord);
  String prevPOS = getPOS(prevWord);
  String nextPOS = getPOS(nextWord);
  String prevprevPOS = getPOS(prevprevWord);
  String nextnextPOS = getPOS(nextnextWord);
  features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
  features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
  features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
  features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
  features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
  features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
  features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
  features.incrementCount("next-pos=" + nextPOS);
  features.incrementCount("prev-pos=" + prevPOS);
  features.incrementCount("first-pos=" + firstPOS);
  features.incrementCount("last-pos=" + lastPOS);
  features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
  features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
  addDependencyFeatures(features, "parent", getDependencyParent(m), true);
  addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
  addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
  // syntax features
  IndexedWord w = m.headIndexedWord;
  String depPath = "";
  int depth = 0;
  while (w != null) {
    SemanticGraphEdge e = getDependencyParent(m, w);
    depth++;
    if (depth <= 3 && e != null) {
      depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
      features.incrementCount("dep-path=" + depPath);
      w = e.getSource();
    } else {
      w = null;
    }
  }
  if (useConstituencyParse) {
    int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
    int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
    if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
      features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
      features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
    } else {
      features.incrementCount("undetermined-embedding-level");
    }
    features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
    String syntaxPath = "";
    Tree tree = m.contextParseTree;
    Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
    depth = 0;
    for (Tree node : tree.pathNodeToNode(head, tree)) {
      syntaxPath += node.value() + "-";
      features.incrementCount("syntax-path=" + syntaxPath);
      depth++;
      if (depth >= 4 || node.value().equals("S")) {
        break;
      }
    }
  }
  // mention containment features
  addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
  addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
  // features from dcoref rules
  addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
  addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
  addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
  addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
  addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
  if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
    features.incrementCount("generic-you");
  }
  return features;
}
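The numeric features above go through addNumeric and bin, which are private helpers of FeatureExtractor not shown here. The following is a rough, illustrative sketch of that bucketing idiom under the assumption that small counts are kept exact and larger counts are collapsed into coarse buckets; the real bucket boundaries in CoreNLP may differ.

// Illustrative reimplementation of the bucketing idiom; not the actual CoreNLP helpers.
// Uses edu.stanford.nlp.stats.Counter, as in FeatureExtractor.
private static String bin(int value) {
  if (value <= 5) {
    return String.valueOf(value);   // small values kept exact
  } else if (value <= 10) {
    return "6-10";                  // mid-sized values share one bucket
  } else {
    return "10+";                   // everything larger collapses into one bucket
  }
}

private static void addNumeric(Counter<String> features, String key, int value) {
  // Emit an indicator feature whose name encodes the bucketed value,
  // e.g. "mention-length=6-10", matching the pattern seen in getFeatures above.
  features.incrementCount(key + "=" + bin(value));
}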
Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.
Class HybridCorefSystem, method runCoref().
public static void runCoref(Properties props) throws Exception {
  /*
   * property, environment setting
   */
  Redwood.hideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
  int nThreads = HybridCorefProperties.getThreadCounts(props);
  String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
  Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());
  // set log file path
  if (props.containsKey(HybridCorefProperties.LOG_PROP)) {
    File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
    RedwoodConfiguration.current().handlers(RedwoodConfiguration.Handlers.file(logFile)).apply();
    Redwood.log("Starting coref log");
  }
  log.info(props.toString());
  if (HybridCorefProperties.checkMemory(props))
    checkMemoryUsage();
  HybridCorefSystem cs = new HybridCorefSystem(props);
  /*
   * output setting
   */
  // prepare conll output
  String goldOutput = null;
  String beforeCorefOutput = null;
  String afterCorefOutput = null;
  PrintWriter writerGold = null;
  PrintWriter writerBeforeCoref = null;
  PrintWriter writerAfterCoref = null;
  if (HybridCorefProperties.doScore(props)) {
    String pathOutput = CorefProperties.conllOutputPath(props);
    (new File(pathOutput)).mkdir();
    goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
    beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
    afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
    writerGold = new PrintWriter(new FileOutputStream(goldOutput));
    writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
    writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
  }
  // run coref: each worker thread resolves one document and returns its CoNLL output and log buffers
  MulticoreWrapper<Pair<Document, HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper<>(nThreads,
      new ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]>() {

        @Override
        public StringBuilder[] process(Pair<Document, HybridCorefSystem> input) {
          try {
            Document document = input.first;
            HybridCorefSystem cs = input.second;
            // conll output and logs
            StringBuilder[] outputs = new StringBuilder[4];
            cs.coref(document, outputs);
            return outputs;
          } catch (Exception e) {
            throw new RuntimeException(e);
          }
        }

        @Override
        public ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]> newInstance() {
          return this;
        }
      });
  Date startTime = null;
  if (HybridCorefProperties.checkTime(props)) {
    startTime = new Date();
    System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
  }
  // run processes
  int docCnt = 0;
  while (true) {
    Document document = cs.docMaker.nextDoc();
    if (document == null)
      break;
    wrapper.put(Pair.makePair(document, cs));
    docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
  }
  // Finished reading the input. Wait for jobs to finish
  wrapper.join();
  docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
  IOUtils.closeIgnoringExceptions(writerGold);
  IOUtils.closeIgnoringExceptions(writerBeforeCoref);
  IOUtils.closeIgnoringExceptions(writerAfterCoref);
  if (HybridCorefProperties.checkTime(props)) {
    System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
    // System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
  }
  if (HybridCorefProperties.checkMemory(props))
    checkMemoryUsage();
  // scoring
  if (HybridCorefProperties.doScore(props)) {
    String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
    CorefScorer.printScoreSummary(summary, logger, false);
    summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary, logger);
  }
}
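A hedged invocation sketch for the method above. The property keys are assumptions drawn from the CorefProperties/HybridCorefProperties accessors used in the code (conllOutputPath, getScorerPath, doScore); check the exact key names in the CoreNLP release being used.

import java.util.Properties;
import edu.stanford.nlp.coref.hybrid.HybridCorefSystem;

public class HybridCorefRunSketch {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("coref.data", "/path/to/conll-2012");                  // assumed key for CoNLL input
    props.setProperty("coref.conllOutputPath", "/tmp/hybrid-coref-output/"); // assumed key for CoNLL output
    props.setProperty("coref.scorer", "/path/to/scorer.pl");                 // assumed key for the CoNLL scorer
    props.setProperty("coref.doScore", "true");                              // assumed key enabling scoring

    HybridCorefSystem.runCoref(props);
  }
}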