
Example 1 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class CorefSystem, the method annotate:

public void annotate(Annotation ann) {
    // Convert the CoreNLP Annotation into the coref package's Document representation
    Document document;
    try {
        document = docMaker.makeDocument(ann);
    } catch (Exception e) {
        throw new RuntimeException("Error making document", e);
    }
    CorefUtils.checkForInterrupt();
    // Run the configured coreference algorithm over the document
    corefAlgorithm.runCoref(document);
    if (removeSingletonClusters) {
        CorefUtils.removeSingletonClusters(document);
    }
    CorefUtils.checkForInterrupt();
    // Convert the resulting clusters into CorefChains and attach them to the annotation
    Map<Integer, CorefChain> result = Generics.newHashMap();
    for (CorefCluster c : document.corefClusters.values()) {
        result.put(c.clusterID, new CorefChain(c, document.positions));
    }
    ann.set(CorefCoreAnnotations.CorefChainAnnotation.class, result);
}
Also used : CorefCluster(edu.stanford.nlp.coref.data.CorefCluster), CorefChain(edu.stanford.nlp.coref.data.CorefChain), Document(edu.stanford.nlp.coref.data.Document), IOException(java.io.IOException)
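
A minimal sketch of how this annotate method might be driven end to end. The annotator list and the CorefSystem(Properties) constructor are assumptions about a typical CoreNLP setup, not details taken from the example above:

import java.util.Properties;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefSystem;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class CorefSystemUsage {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Upstream annotators that coref depends on; the exact list is an assumption
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation ann = new Annotation("Barack Obama was born in Hawaii. He was the 44th president.");
        pipeline.annotate(ann);
        CorefSystem system = new CorefSystem(props);  // assumed Properties constructor
        system.annotate(ann);
        // Read back the chains that annotate stored on the annotation
        ann.get(CorefCoreAnnotations.CorefChainAnnotation.class)
           .forEach((id, chain) -> System.out.println(id + ": " + chain));
    }
}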

Example 2 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class CorefSystem, the method runOnConll:

public void runOnConll(Properties props) throws Exception {
    // Create the CoNLL output directory if it does not already exist
    File f = new File(CorefProperties.conllOutputPath(props));
    if (!f.exists()) {
        f.mkdirs();
    }
    // Build timestamped file names for the gold, pre-coref, and post-coref outputs
    String timestamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
    String baseName = CorefProperties.conllOutputPath(props) + timestamp;
    String goldOutput = baseName + ".gold.txt";
    String beforeCorefOutput = baseName + ".predicted.txt";
    String afterCorefOutput = baseName + ".coref.predicted.txt";
    PrintWriter writerGold = new PrintWriter(new FileOutputStream(goldOutput));
    PrintWriter writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
    PrintWriter writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
    Logger logger = Logger.getLogger(CorefSystem.class.getName());
    initLogger(logger, baseName + ".log");
    logger.info(timestamp);
    logger.info(props.toString());
    (new CorefDocumentProcessor() {

        @Override
        public void process(int id, Document document) {
            writerGold.print(CorefPrinter.printConllOutput(document, true));
            writerBeforeCoref.print(CorefPrinter.printConllOutput(document, false));
            long time = System.currentTimeMillis();
            corefAlgorithm.runCoref(document);
            if (verbose) {
                Redwood.log(getName(), "Coref took " + (System.currentTimeMillis() - time) / 1000.0 + "s");
            }
            CorefUtils.removeSingletonClusters(document);
            if (verbose) {
                CorefUtils.printHumanReadableCoref(document);
            }
            if (document.filterMentionSet != null) {
                Map<Integer, CorefCluster> filteredClusters = document.corefClusters.values().stream().filter(x -> CorefUtils.filterClustersWithMentionSpans(x, document.filterMentionSet)).collect(Collectors.toMap(x -> x.clusterID, x -> x));
                writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true, filteredClusters));
            } else {
                writerAfterCoref.print(CorefPrinter.printConllOutput(document, false, true));
            }
        }

        @Override
        public void finish() throws Exception {
        }

        @Override
        public String getName() {
            return corefAlgorithm.getClass().getName();
        }
    }).run(docMaker);
    // Score against the gold output, first for the pre-coref mentions, then post-coref
    String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
    logger.info("Before Coref");
    CorefScorer.printScoreSummary(summary, logger, false);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary, logger);
    summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
    logger.info("After Coref");
    CorefScorer.printScoreSummary(summary, logger, false);
    CorefScorer.printScoreSummary(summary, logger, true);
    CorefScorer.printFinalConllScore(summary, logger);
    writerGold.close();
    writerBeforeCoref.close();
    writerAfterCoref.close();
}
Also used : PrintWriter(java.io.PrintWriter), NewlineLogFormatter(edu.stanford.nlp.util.logging.NewlineLogFormatter), Properties(java.util.Properties), CorefChain(edu.stanford.nlp.coref.data.CorefChain), Redwood(edu.stanford.nlp.util.logging.Redwood), FileOutputStream(java.io.FileOutputStream), IOException(java.io.IOException), DocumentMaker(edu.stanford.nlp.coref.data.DocumentMaker), Logger(java.util.logging.Logger), Dictionaries(edu.stanford.nlp.coref.data.Dictionaries), Collectors(java.util.stream.Collectors), File(java.io.File), Level(java.util.logging.Level), FileHandler(java.util.logging.FileHandler), CorefCluster(edu.stanford.nlp.coref.data.CorefCluster), Calendar(java.util.Calendar), Annotation(edu.stanford.nlp.pipeline.Annotation), StringUtils(edu.stanford.nlp.util.StringUtils), Map(java.util.Map), Document(edu.stanford.nlp.coref.data.Document), Generics(edu.stanford.nlp.util.Generics)
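
A hedged sketch of kicking off this CoNLL evaluation run. The property keys below are assumptions about what CorefProperties.conllOutputPath and CorefProperties.getScorerPath read; check CorefProperties for the keys your CoreNLP version actually expects:

import java.util.Properties;
import edu.stanford.nlp.coref.CorefSystem;

public class RunConllEval {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        // Hypothetical keys and paths, for illustration only
        props.setProperty("coref.conllOutputPath", "/tmp/coref-conll/");
        props.setProperty("coref.scorer", "/path/to/reference-coreference-scorers/scorer.pl");
        CorefSystem system = new CorefSystem(props);  // assumed Properties constructor
        system.runOnConll(props);
    }
}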

Example 3 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the interface CorefDocumentProcessor, the default method run:

public default void run(DocumentMaker docMaker) throws Exception {
    Redwood.hideChannelsEverywhere("debug-mention", "debug-preprocessor", "debug-docreader", "debug-md");
    int docId = 0;
    Document document = docMaker.nextDoc();
    long time = System.currentTimeMillis();
    // Pull documents from the DocumentMaker until it is exhausted, timing each one
    while (document != null) {
        process(docId, document);
        Redwood.log(getName(), "Processed document " + docId + " in " + (System.currentTimeMillis() - time) / 1000.0 + "s");
        time = System.currentTimeMillis();
        docId++;
        document = docMaker.nextDoc();
    }
    finish();
}
Also used : Document(edu.stanford.nlp.coref.data.Document)
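
Because run is a default method, an implementation only needs process, finish, and getName. A minimal sketch that counts predicted mentions; the counting logic is illustrative, and it assumes predictedMentions holds one mention list per sentence, as its use in Example 4 suggests:

import java.util.List;
import edu.stanford.nlp.coref.CorefDocumentProcessor;
import edu.stanford.nlp.coref.data.Document;

public class MentionCounter implements CorefDocumentProcessor {
    private int totalMentions = 0;

    @Override
    public void process(int id, Document document) {
        // Sum mention counts across all sentences of the document
        totalMentions += document.predictedMentions.stream().mapToInt(List::size).sum();
    }

    @Override
    public void finish() {
        System.out.println("Total predicted mentions: " + totalMentions);
    }

    @Override
    public String getName() {
        return "mention-counter";
    }
}

Running it is then just new MentionCounter().run(docMaker);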

Example 4 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class FeatureExtractor, the method getFeatures:

private Counter<String> getFeatures(Document doc, Mention m, Map<Integer, List<Mention>> mentionsByHeadIndex) {
    Counter<String> features = new ClassicCounter<>();
    // type features
    features.incrementCount("mention-type=" + m.mentionType);
    features.incrementCount("gender=" + m.gender);
    features.incrementCount("person-fine=" + m.person);
    features.incrementCount("head-ne-type=" + m.nerString);
    List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
        if (e.getKey() < singletonFeatures.size()) {
            features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
        }
    }
    // length and location features
    addNumeric(features, "mention-length", m.spanToString().length());
    addNumeric(features, "mention-words", m.originalSpan.size());
    addNumeric(features, "sentence-words", m.sentenceWords.size());
    features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
    features.incrementCount("mention-position", m.mentionNum / (double) doc.predictedMentions.size());
    features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);
    // lexical features
    CoreLabel firstWord = firstWord(m);
    CoreLabel lastWord = lastWord(m);
    CoreLabel headWord = headWord(m);
    CoreLabel prevWord = prevWord(m);
    CoreLabel nextWord = nextWord(m);
    CoreLabel prevprevWord = prevprevWord(m);
    CoreLabel nextnextWord = nextnextWord(m);
    String headPOS = getPOS(headWord);
    String firstPOS = getPOS(firstWord);
    String lastPOS = getPOS(lastWord);
    String prevPOS = getPOS(prevWord);
    String nextPOS = getPOS(nextWord);
    String prevprevPOS = getPOS(prevprevWord);
    String nextnextPOS = getPOS(nextnextWord);
    features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
    features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
    features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
    features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
    features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
    features.incrementCount("next-bigram=" + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.incrementCount("prev-bigram=" + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.incrementCount("next-pos=" + nextPOS);
    features.incrementCount("prev-pos=" + prevPOS);
    features.incrementCount("first-pos=" + firstPOS);
    features.incrementCount("last-pos=" + lastPOS);
    features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    addDependencyFeatures(features, "parent", getDependencyParent(m), true);
    addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));
    // syntax features
    IndexedWord w = m.headIndexedWord;
    String depPath = "";
    int depth = 0;
    while (w != null) {
        SemanticGraphEdge e = getDependencyParent(m, w);
        depth++;
        if (depth <= 3 && e != null) {
            depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
            features.incrementCount("dep-path=" + depPath);
            w = e.getSource();
        } else {
            w = null;
        }
    }
    if (useConstituencyParse) {
        int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
        int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
        if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
            features.incrementCount("mention-embedding-level=" + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
            features.incrementCount("head-embedding-level=" + bin(mentionEmbeddingLevel));
        } else {
            features.incrementCount("undetermined-embedding-level");
        }
        features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
        String syntaxPath = "";
        Tree tree = m.contextParseTree;
        Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
        depth = 0;
        for (Tree node : tree.pathNodeToNode(head, tree)) {
            syntaxPath += node.value() + "-";
            features.incrementCount("syntax-path=" + syntaxPath);
            depth++;
            if (depth >= 4 || node.value().equals("S")) {
                break;
            }
        }
    }
    // mention containment features
    addFeature(features, "contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
    addFeature(features, "contains-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));
    // features from dcoref rules
    addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
    addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
    addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
    addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
    addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
    if (doc.docType != DocType.ARTICLE && m.person == Person.YOU && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
        features.incrementCount("generic-you");
    }
    return features;
}
Also used : SpeakerAnnotation(edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation), Tree(edu.stanford.nlp.trees.Tree), HashMap(java.util.HashMap), Random(java.util.Random), Dictionaries(edu.stanford.nlp.coref.data.Dictionaries), ArrayList(java.util.ArrayList), HashSet(java.util.HashSet), Number(edu.stanford.nlp.coref.data.Dictionaries.Number), CorefCluster(edu.stanford.nlp.coref.data.CorefCluster), Mention(edu.stanford.nlp.coref.data.Mention), RuleBasedCorefMentionFinder(edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder), Counter(edu.stanford.nlp.stats.Counter), Map(java.util.Map), Pair(edu.stanford.nlp.util.Pair), ClassicCounter(edu.stanford.nlp.stats.ClassicCounter), CorefRules(edu.stanford.nlp.coref.CorefRules), IndexedWord(edu.stanford.nlp.ling.IndexedWord), CoreLabel(edu.stanford.nlp.ling.CoreLabel), Properties(java.util.Properties), CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations), SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge), Iterator(java.util.Iterator), IOUtils(edu.stanford.nlp.io.IOUtils), DocType(edu.stanford.nlp.coref.data.Document.DocType), Set(java.util.Set), Person(edu.stanford.nlp.coref.data.Dictionaries.Person), List(java.util.List), MentionType(edu.stanford.nlp.coref.data.Dictionaries.MentionType), StringUtils(edu.stanford.nlp.util.StringUtils), CorefProperties(edu.stanford.nlp.coref.CorefProperties), Document(edu.stanford.nlp.coref.data.Document), CorefUtils(edu.stanford.nlp.coref.CorefUtils)
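
The example leans on several private helpers that are not shown (addFeature, addNumeric, bin, wordIndicator). A hedged reconstruction of the simpler ones, inferred only from their call sites above; the actual binning scheme in FeatureExtractor may well differ:

import edu.stanford.nlp.stats.Counter;

final class FeatureHelpers {
    // Fire a boolean indicator feature only when the condition holds
    static void addFeature(Counter<String> features, String name, boolean value) {
        if (value) {
            features.incrementCount(name);
        }
    }

    // Record both a binned indicator and the raw numeric value of a count feature
    static void addNumeric(Counter<String> features, String name, int value) {
        features.incrementCount(name + "=" + bin(value));
        features.incrementCount(name, value);
    }

    // Collapse counts into coarse buckets so rare large values share a feature (assumed scheme)
    static String bin(int value) {
        if (value <= 5) {
            return String.valueOf(value);
        }
        return value <= 10 ? "6-10" : "10+";
    }
}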

Example 5 with Document

Use of edu.stanford.nlp.coref.data.Document in project CoreNLP by stanfordnlp.

From the class HybridCorefSystem, the method runCoref:

public static void runCoref(Properties props) throws Exception {
    /*
     * property, environment setting
     */
    Redwood.hideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
    int nThreads = HybridCorefProperties.getThreadCounts(props);
    String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
    Logger logger = Logger.getLogger(HybridCorefSystem.class.getName());
    // set log file path
    if (props.containsKey(HybridCorefProperties.LOG_PROP)) {
        File logFile = new File(props.getProperty(HybridCorefProperties.LOG_PROP));
        RedwoodConfiguration.current().handlers(RedwoodConfiguration.Handlers.file(logFile)).apply();
        Redwood.log("Starting coref log");
    }
    log.info(props.toString());
    if (HybridCorefProperties.checkMemory(props))
        checkMemoryUsage();
    HybridCorefSystem cs = new HybridCorefSystem(props);
    /*
     * output setting
     */
    // prepare conll output
    String goldOutput = null;
    String beforeCorefOutput = null;
    String afterCorefOutput = null;
    PrintWriter writerGold = null;
    PrintWriter writerBeforeCoref = null;
    PrintWriter writerAfterCoref = null;
    if (HybridCorefProperties.doScore(props)) {
        String pathOutput = CorefProperties.conllOutputPath(props);
        (new File(pathOutput)).mkdir();
        goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
        beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
        afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
        writerGold = new PrintWriter(new FileOutputStream(goldOutput));
        writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
        writerAfterCoref = new PrintWriter(new FileOutputStream(afterCorefOutput));
    }
    // run coref
    MulticoreWrapper<Pair<Document, HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper<>(nThreads, new ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]>() {

        @Override
        public StringBuilder[] process(Pair<Document, HybridCorefSystem> input) {
            try {
                Document document = input.first;
                HybridCorefSystem cs = input.second;
                // conll output and logs
                StringBuilder[] outputs = new StringBuilder[4];
                cs.coref(document, outputs);
                return outputs;
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public ThreadsafeProcessor<Pair<Document, HybridCorefSystem>, StringBuilder[]> newInstance() {
            return this;
        }
    });
    Date startTime = null;
    if (HybridCorefProperties.checkTime(props)) {
        startTime = new Date();
        System.err.printf("END-TO-END COREF Start time: %s\n", startTime);
    }
    // run processes
    int docCnt = 0;
    while (true) {
        Document document = cs.docMaker.nextDoc();
        if (document == null)
            break;
        wrapper.put(Pair.makePair(document, cs));
        docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
    }
    // Finished reading the input. Wait for jobs to finish
    wrapper.join();
    docCnt = logOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
    IOUtils.closeIgnoringExceptions(writerGold);
    IOUtils.closeIgnoringExceptions(writerBeforeCoref);
    IOUtils.closeIgnoringExceptions(writerAfterCoref);
    if (HybridCorefProperties.checkTime(props)) {
        System.err.printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new Date()).getTime() - startTime.getTime()) / 1000F));
    // System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
    }
    if (HybridCorefProperties.checkMemory(props))
        checkMemoryUsage();
    // scoring
    if (HybridCorefProperties.doScore(props)) {
        String summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, beforeCorefOutput);
        CorefScorer.printScoreSummary(summary, logger, false);
        summary = CorefScorer.getEvalSummary(CorefProperties.getScorerPath(props), goldOutput, afterCorefOutput);
        CorefScorer.printScoreSummary(summary, logger, true);
        CorefScorer.printFinalConllScore(summary, logger);
    }
}
Also used : ThreadsafeProcessor(edu.stanford.nlp.util.concurrent.ThreadsafeProcessor), MulticoreWrapper(edu.stanford.nlp.util.concurrent.MulticoreWrapper), Logger(java.util.logging.Logger), Document(edu.stanford.nlp.coref.data.Document), Date(java.util.Date), FileOutputStream(java.io.FileOutputStream), File(java.io.File), PrintWriter(java.io.PrintWriter), Pair(edu.stanford.nlp.util.Pair)
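
A minimal sketch of launching this end-to-end run from a main method. StringUtils.argsToProperties is a standard CoreNLP utility; the rest of the wiring here is an assumption about typical usage:

import java.util.Properties;
import edu.stanford.nlp.coref.hybrid.HybridCorefSystem;
import edu.stanford.nlp.util.StringUtils;

public class RunHybridCoref {
    public static void main(String[] args) throws Exception {
        // Accepts -props file.properties or individual -key value flags
        Properties props = StringUtils.argsToProperties(args);
        HybridCorefSystem.runCoref(props);
    }
}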

Aggregations

Document (edu.stanford.nlp.coref.data.Document): 8
CorefCluster (edu.stanford.nlp.coref.data.CorefCluster): 4
CorefChain (edu.stanford.nlp.coref.data.CorefChain): 3
Dictionaries (edu.stanford.nlp.coref.data.Dictionaries): 3
Mention (edu.stanford.nlp.coref.data.Mention): 3
IOException (java.io.IOException): 3
ArrayList (java.util.ArrayList): 3
Properties (java.util.Properties): 3
DocumentMaker (edu.stanford.nlp.coref.data.DocumentMaker): 2
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 2
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 2
IndexedWord (edu.stanford.nlp.ling.IndexedWord): 2
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 2
Pair (edu.stanford.nlp.util.Pair): 2
StringUtils (edu.stanford.nlp.util.StringUtils): 2
File (java.io.File): 2
FileOutputStream (java.io.FileOutputStream): 2
PrintWriter (java.io.PrintWriter): 2
Map (java.util.Map): 2
Logger (java.util.logging.Logger): 2