Search in sources:

Example 26 with CoreMap

use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.

The class WordsToSentencesAnnotator, method annotate.

/**
 * If setCountLineNumbers is set to true, we count line numbers by
 * telling the underlying splitter to return empty lists of tokens
 * and then treating those empty lists as empty lines.  We don't
 * actually include empty sentences in the annotation, though.
 **/
@Override
public void annotate(Annotation annotation) {
    if (VERBOSE) {
        log.info("Sentence splitting ...");
    }
    if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
        throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }
    // Pull the raw text, token list, and document id off the document annotation.
    String docText = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreLabel> docTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    // Assemble one sentence annotation per token run returned by the splitter.
    List<CoreMap> sentences = new ArrayList<>();
    // running offset of the next token within the document token list
    int nextTokenIndex = 0;
    // 1-based line counter; only meaningful when countLineNumbers is set
    int currentLine = 0;
    // section annotations currently in effect, used to mark sentences
    CoreMap activeSection = null;
    for (List<CoreLabel> sentenceTokens : wts.process(docTokens)) {
        if (countLineNumbers) {
            ++currentLine;
        }
        if (sentenceTokens.isEmpty()) {
            // Empty token lists only appear when we asked the splitter for them
            // (line-counting mode); they mark blank lines and are skipped.
            if (countLineNumbers) {
                continue;
            }
            throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
        }
        // The sentence text spans from the first token's begin offset
        // to the last token's end offset.
        CoreLabel firstToken = sentenceTokens.get(0);
        CoreLabel lastToken = sentenceTokens.get(sentenceTokens.size() - 1);
        int begin = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int end = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        // Create the sentence annotation carrying text, character and token offsets.
        Annotation sentence = new Annotation(docText.substring(begin, end));
        sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
        sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
        sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
        sentence.set(CoreAnnotations.TokenBeginAnnotation.class, nextTokenIndex);
        nextTokenIndex += sentenceTokens.size();
        sentence.set(CoreAnnotations.TokenEndAnnotation.class, nextTokenIndex);
        sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
        if (countLineNumbers) {
            sentence.set(CoreAnnotations.LineNumberAnnotation.class, currentLine);
        }
        // Section handling: a section is assumed to start on a sentence's first
        // token and end on a sentence's last token.
        CoreMap sectionStart = firstToken.get(CoreAnnotations.SectionStartAnnotation.class);
        if (sectionStart != null) {
            activeSection = sectionStart;
        }
        if (activeSection != null) {
            // transfer the active section's annotations over to this sentence
            ChunkAnnotationUtils.copyUnsetAnnotations(activeSection, sentence);
        }
        if (lastToken.get(CoreAnnotations.SectionEndAnnotation.class) != null) {
            activeSection = null;
        }
        if (docID != null) {
            sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
        }
        // Number tokens within the sentence (1-based) and stamp sentence/doc ids.
        int tokenIndex = 1;
        for (CoreLabel token : sentenceTokens) {
            token.setIndex(tokenIndex++);
            token.setSentIndex(sentences.size());
            if (docID != null) {
                token.setDocID(docID);
            }
        }
        sentences.add(sentence);
    }
    // No sanity check that every token was consumed: counts may legitimately
    // differ when sentenceBoundaryToDiscard is initialized.
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation)

Example 27 with CoreMap

use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.

The class XMLOutputter, method annotationToDoc.

/**
 * Converts the given annotation to an XML document using the specified options.
 *
 * @param annotation the document-level annotation to serialize
 * @param options output options (text inclusion, tree printer, relations beam, ...)
 * @return the XML document rooted at a "root" element in NAMESPACE_URI
 */
public static Document annotationToDoc(Annotation annotation, Options options) {
    //
    // create the XML document with the root node pointing to the namespace URL
    //
    Element root = new Element("root", NAMESPACE_URI);
    Document xmlDoc = new Document(root);
    ProcessingInstruction pi = new ProcessingInstruction("xml-stylesheet", "href=\"" + STYLESHEET_NAME + "\" type=\"text/xsl\"");
    xmlDoc.insertChild(pi, 0);
    Element docElem = new Element("document", NAMESPACE_URI);
    root.appendChild(docElem);
    // document-level metadata
    setSingleElement(docElem, "docId", NAMESPACE_URI, annotation.get(CoreAnnotations.DocIDAnnotation.class));
    setSingleElement(docElem, "docDate", NAMESPACE_URI, annotation.get(CoreAnnotations.DocDateAnnotation.class));
    setSingleElement(docElem, "docSourceType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocSourceTypeAnnotation.class));
    setSingleElement(docElem, "docType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocTypeAnnotation.class));
    setSingleElement(docElem, "author", NAMESPACE_URI, annotation.get(CoreAnnotations.AuthorAnnotation.class));
    setSingleElement(docElem, "location", NAMESPACE_URI, annotation.get(CoreAnnotations.LocationAnnotation.class));
    if (options.includeText) {
        setSingleElement(docElem, "text", NAMESPACE_URI, annotation.get(CoreAnnotations.TextAnnotation.class));
    }
    Element sentencesElem = new Element("sentences", NAMESPACE_URI);
    docElem.appendChild(sentencesElem);
    //
    // emit one <sentence> element per sentence annotation, if any
    //
    if (annotation.get(CoreAnnotations.SentencesAnnotation.class) != null) {
        int sentCount = 1;
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            Element sentElem = new Element("sentence", NAMESPACE_URI);
            sentElem.addAttribute(new Attribute("id", Integer.toString(sentCount)));
            Integer lineNumber = sentence.get(CoreAnnotations.LineNumberAnnotation.class);
            if (lineNumber != null) {
                sentElem.addAttribute(new Attribute("line", Integer.toString(lineNumber)));
            }
            sentCount++;
            // add the word table with all token-level annotations
            Element wordTable = new Element("tokens", NAMESPACE_URI);
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            for (int j = 0; j < tokens.size(); j++) {
                Element wordInfo = new Element("token", NAMESPACE_URI);
                addWordInfo(wordInfo, tokens.get(j), j + 1, NAMESPACE_URI);
                wordTable.appendChild(wordInfo);
            }
            sentElem.appendChild(wordTable);
            // add tree info
            Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
            if (tree != null) {
                // add the constituent tree for this sentence
                Element parseInfo = new Element("parse", NAMESPACE_URI);
                addConstituentTreeInfo(parseInfo, tree, options.constituentTreePrinter);
                sentElem.appendChild(parseInfo);
            }
            SemanticGraph basicDependencies = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
            if (basicDependencies != null) {
                // add the dependencies for this sentence; reuse the graph already
                // fetched above rather than re-reading the annotation
                Element depInfo = buildDependencyTreeInfo("basic-dependencies", basicDependencies, tokens, NAMESPACE_URI);
                if (depInfo != null) {
                    sentElem.appendChild(depInfo);
                }
                depInfo = buildDependencyTreeInfo("collapsed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
                if (depInfo != null) {
                    sentElem.appendChild(depInfo);
                }
                depInfo = buildDependencyTreeInfo("collapsed-ccprocessed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
                if (depInfo != null) {
                    sentElem.appendChild(depInfo);
                }
                depInfo = buildDependencyTreeInfo("enhanced-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
                if (depInfo != null) {
                    sentElem.appendChild(depInfo);
                }
                depInfo = buildDependencyTreeInfo("enhanced-plus-plus-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), tokens, NAMESPACE_URI);
                if (depInfo != null) {
                    sentElem.appendChild(depInfo);
                }
            }
            // add Open IE triples
            Collection<RelationTriple> openieTriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
            if (openieTriples != null) {
                Element openieElem = new Element("openie", NAMESPACE_URI);
                addTriples(openieTriples, openieElem, NAMESPACE_URI);
                sentElem.appendChild(openieElem);
            }
            // add KBP triples
            Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
            if (kbpTriples != null) {
                Element kbpElem = new Element("kbp", NAMESPACE_URI);
                addTriples(kbpTriples, kbpElem, NAMESPACE_URI);
                sentElem.appendChild(kbpElem);
            }
            // add the MR entities and relations
            List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
            List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
            if (entities != null && !entities.isEmpty()) {
                Element mrElem = new Element("MachineReading", NAMESPACE_URI);
                Element entElem = new Element("entities", NAMESPACE_URI);
                addEntities(entities, entElem, NAMESPACE_URI);
                mrElem.appendChild(entElem);
                if (relations != null) {
                    Element relElem = new Element("relations", NAMESPACE_URI);
                    addRelations(relations, relElem, NAMESPACE_URI, options.relationsBeam);
                    mrElem.appendChild(relElem);
                }
                sentElem.appendChild(mrElem);
            }
            // add sentiment as attributes of this sentence
            // NOTE(review): assumes SentimentClass is always set when the
            // sentiment tree is present; otherwise replaceAll would NPE -- confirm
            Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
            if (sentimentTree != null) {
                int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
                sentElem.addAttribute(new Attribute("sentimentValue", Integer.toString(sentiment)));
                String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
                sentElem.addAttribute(new Attribute("sentiment", sentimentClass.replaceAll(" ", "")));
            }
            // add the sentence to the root
            sentencesElem.appendChild(sentElem);
        }
    }
    //
    // add the coref graph
    //
    Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    if (corefChains != null) {
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        Element corefInfo = new Element("coreference", NAMESPACE_URI);
        if (addCorefGraphInfo(options, corefInfo, sentences, corefChains, NAMESPACE_URI))
            docElem.appendChild(corefInfo);
    }
    return xmlDoc;
}
Also used : RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) MachineReadingAnnotations(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) CorefChain(edu.stanford.nlp.coref.data.CorefChain) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) Tree(edu.stanford.nlp.trees.Tree) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) TreePrint(edu.stanford.nlp.trees.TreePrint) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 28 with CoreMap

use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.

The class WordToSentenceProcessor, method wordsToSentences.

/**
 * Returns a List of Lists where each element is built from a run
 * of Words in the input Document. Specifically, reads through each word in
 * the input document and breaks off a sentence after finding a valid
 * sentence boundary token or end of file.
 * Note that for this to work, the words in the
 * input document must have been tokenized with a tokenizer that makes
 * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
 *
 * @param words A list of already tokenized words (must implement HasWord or be a String).
 * @return A list of sentences.
 * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
 */
public List<List<IN>> wordsToSentences(List<? extends IN> words) {
    // is null unless used by sentenceBoundaryMultiTokenPattern
    IdentityHashMap<Object, Boolean> isSentenceBoundary = null;
    if (sentenceBoundaryMultiTokenPattern != null) {
        // Do initial pass using tokensregex to identify multi token patterns that need to be matched
        // and add the last token to our table of sentence boundary tokens
        isSentenceBoundary = new IdentityHashMap<>();
        SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
        while (matcher.find()) {
            // wildcard rather than a raw List: the element type is unknown here
            // and only identity is needed for the boundary table
            List<?> nodes = matcher.groupNodes();
            if (nodes != null && !nodes.isEmpty()) {
                isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
            }
        }
    }
    // Split tokens into sentences!!!
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<>();
    // kept so that sentence-boundary followers (e.g. closing quotes) can be
    // appended to the previous sentence
    List<IN> lastSentence = null;
    boolean insideRegion = false;
    boolean inWaitForForcedEnd = false;
    boolean lastTokenWasNewline = false;
    for (IN o : words) {
        String word = getString(o);
        boolean forcedEnd = isForcedEndToken(o);
        boolean inMultiTokenExpr = false;
        boolean discardToken = false;
        if (o instanceof CoreMap) {
            // Hacky stuff to ensure sentence breaks do not happen in certain cases
            CoreMap cm = (CoreMap) o;
            Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
            if (!forcedEnd) {
                if (forcedUntilEndValue != null && forcedUntilEndValue)
                    inWaitForForcedEnd = true;
                else {
                    MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
                    if (mt != null && !mt.isEnd()) {
                        // In the middle of a multi token mention, make sure sentence is not ended here
                        inMultiTokenExpr = true;
                    }
                }
            }
        }
        if (tokenPatternsToDiscard != null) {
            discardToken = matchesTokenPatternsToDiscard(word);
        }
        // Outside any region: drop tokens until the region-begin pattern matches.
        if (sentenceRegionBeginPattern != null && !insideRegion) {
            if (DEBUG) {
                log.info("Word is " + word + "; outside region; deleted");
            }
            if (sentenceRegionBeginPattern.matcher(word).matches()) {
                insideRegion = true;
                if (DEBUG) {
                    log.info("  entering region");
                }
            }
            lastTokenWasNewline = false;
            continue;
        }
        // A boundary follower (e.g. close paren/quote) right after a sentence
        // break is attached to the previous sentence, not the next one.
        if (lastSentence != null && currentSentence.isEmpty() && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
            if (!discardToken) {
                lastSentence.add(o);
            }
            if (DEBUG) {
                log.info("Word is " + word + (discardToken ? "discarded" : "  added to last sentence"));
            }
            lastTokenWasNewline = false;
            continue;
        }
        boolean newSent = false;
        String debugText = (discardToken) ? "discarded" : "added to current";
        if (inWaitForForcedEnd && !forcedEnd) {
            // suppress breaks until the annotated forced sentence end arrives
            if (!discardToken)
                currentSentence.add(o);
            if (DEBUG) {
                log.info("Word is " + word + "; is in wait for forced end; " + debugText);
            }
        } else if (inMultiTokenExpr && !forcedEnd) {
            // suppress breaks inside a multi-token mention
            if (!discardToken)
                currentSentence.add(o);
            if (DEBUG) {
                log.info("Word is " + word + "; is in multi token expr; " + debugText);
            }
        } else if (sentenceBoundaryToDiscard.contains(word)) {
            // token is a discarded boundary (typically a newline)
            if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
                newSent = true;
            } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
                if (lastTokenWasNewline) {
                    newSent = true;
                }
            }
            lastTokenWasNewline = true;
            if (DEBUG) {
                log.info("Word is " + word + "  discarded sentence boundary");
            }
        } else {
            lastTokenWasNewline = false;
            Boolean isb;
            if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
                newSent = true;
                if (DEBUG) {
                    log.info("Word is " + word + "; is XML break element; discarded");
                }
            } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
                insideRegion = false;
                newSent = true;
            // Marked sentence boundaries
            } else if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary.get(o)) != null) && isb) {
                if (!discardToken)
                    currentSentence.add(o);
                if (DEBUG) {
                    log.info("Word is " + word + "; is sentence boundary (matched multi-token pattern); " + debugText);
                }
                newSent = true;
            } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
                if (!discardToken)
                    currentSentence.add(o);
                if (DEBUG) {
                    log.info("Word is " + word + "; is sentence boundary; " + debugText);
                }
                newSent = true;
            } else if (forcedEnd) {
                if (!discardToken)
                    currentSentence.add(o);
                inWaitForForcedEnd = false;
                newSent = true;
                if (DEBUG) {
                    log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
                }
            } else {
                // ordinary token: just accumulate it
                if (!discardToken)
                    currentSentence.add(o);
                if (DEBUG) {
                    log.info("Word is " + word + "; " + debugText);
                }
            }
        }
        if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
            if (DEBUG) {
                log.info("  beginning new sentence");
            }
            sentences.add(currentSentence);
            // adds this sentence now that it's complete
            lastSentence = currentSentence;
            // clears the current sentence
            currentSentence = new ArrayList<>();
        }
    }
    // terminator at the end of file
    if (!currentSentence.isEmpty()) {
        // adds last sentence
        sentences.add(currentSentence);
    }
    return sentences;
}
Also used : MultiTokenTag(edu.stanford.nlp.ling.MultiTokenTag) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 29 with CoreMap

use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.

The class TrueCaseAnnotator, method annotate.

/** Adds a true-case tag (and true-cased text) to every token of every sentence. */
@Override
public void annotate(Annotation annotation) {
    if (verbose) {
        log.info("Adding true-case annotation...");
    }
    // Sentences must already be present; fail fast otherwise.
    if (!annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
        throw new RuntimeException("unable to find sentences in: " + annotation);
    }
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        // Classify the whole sentence at once; result is parallel to the input tokens.
        List<CoreLabel> classified = this.trueCaser.classifySentence(sentenceTokens);
        int size = sentenceTokens.size();
        for (int i = 0; i < size; i++) {
            CoreLabel token = sentenceTokens.get(i);
            // copy the classifier's answer onto the original token
            token.set(CoreAnnotations.TrueCaseAnnotation.class, classified.get(i).get(CoreAnnotations.AnswerAnnotation.class));
            setTrueCaseText(token);
        }
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 30 with CoreMap

use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.

The class JSONOutputter, method print.

/** {@inheritDoc} */
// It's lying; we need the "redundant" casts (as of 2014-09-08)
@SuppressWarnings("RedundantCast")
@Override
public void print(Annotation doc, OutputStream target, Options options) throws IOException {
    PrintWriter writer = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));
    JSONWriter l0 = new JSONWriter(writer, options);
    l0.object(l1 -> {
        l1.set("docId", doc.get(CoreAnnotations.DocIDAnnotation.class));
        l1.set("docDate", doc.get(CoreAnnotations.DocDateAnnotation.class));
        l1.set("docSourceType", doc.get(CoreAnnotations.DocSourceTypeAnnotation.class));
        l1.set("docType", doc.get(CoreAnnotations.DocTypeAnnotation.class));
        l1.set("author", doc.get(CoreAnnotations.AuthorAnnotation.class));
        l1.set("location", doc.get(CoreAnnotations.LocationAnnotation.class));
        if (options.includeText) {
            l1.set("text", doc.get(CoreAnnotations.TextAnnotation.class));
        }
        if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) {
            l1.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> (Consumer<Writer>) (Writer l2) -> {
                l2.set("id", sentence.get(CoreAnnotations.SentenceIDAnnotation.class));
                l2.set("index", sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
                l2.set("line", sentence.get(CoreAnnotations.LineNumberAnnotation.class));
                StringWriter treeStrWriter = new StringWriter();
                TreePrint treePrinter = options.constituentTreePrinter;
                if (treePrinter == AnnotationOutputter.DEFAULT_CONSTITUENT_TREE_PRINTER) {
                    treePrinter = new TreePrint("oneline");
                }
                treePrinter.printTree(sentence.get(TreeCoreAnnotations.TreeAnnotation.class), new PrintWriter(treeStrWriter, true));
                String treeStr = treeStrWriter.toString().trim();
                if (!"SENTENCE_SKIPPED_OR_UNPARSABLE".equals(treeStr)) {
                    l2.set("parse", treeStr);
                }
                l2.set("basicDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)));
                l2.set("enhancedDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)));
                l2.set("enhancedPlusPlusDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
                Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
                if (sentimentTree != null) {
                    int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
                    String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
                    l2.set("sentimentValue", Integer.toString(sentiment));
                    l2.set("sentiment", sentimentClass.replaceAll(" ", ""));
                }
                Collection<RelationTriple> openIETriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
                if (openIETriples != null) {
                    l2.set("openie", openIETriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> {
                        tripleWriter.set("subject", triple.subjectGloss());
                        tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan()));
                        tripleWriter.set("relation", triple.relationGloss());
                        tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan()));
                        tripleWriter.set("object", triple.objectGloss());
                        tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan()));
                    }));
                }
                Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
                if (kbpTriples != null) {
                    l2.set("kbp", kbpTriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> {
                        tripleWriter.set("subject", triple.subjectGloss());
                        tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan()));
                        tripleWriter.set("relation", triple.relationGloss());
                        tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan()));
                        tripleWriter.set("object", triple.objectGloss());
                        tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan()));
                    }));
                }
                if (sentence.get(CoreAnnotations.MentionsAnnotation.class) != null) {
                    Integer sentTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
                    l2.set("entitymentions", sentence.get(CoreAnnotations.MentionsAnnotation.class).stream().map(m -> (Consumer<Writer>) (Writer l3) -> {
                        Integer tokenBegin = m.get(CoreAnnotations.TokenBeginAnnotation.class);
                        Integer tokenEnd = m.get(CoreAnnotations.TokenEndAnnotation.class);
                        l3.set("docTokenBegin", tokenBegin);
                        l3.set("docTokenEnd", tokenEnd);
                        if (tokenBegin != null && sentTokenBegin != null) {
                            l3.set("tokenBegin", tokenBegin - sentTokenBegin);
                        }
                        if (tokenEnd != null && sentTokenBegin != null) {
                            l3.set("tokenEnd", tokenEnd - sentTokenBegin);
                        }
                        l3.set("text", m.get(CoreAnnotations.TextAnnotation.class));
                        l3.set("characterOffsetBegin", m.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
                        l3.set("characterOffsetEnd", m.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
                        l3.set("ner", m.get(CoreAnnotations.NamedEntityTagAnnotation.class));
                        l3.set("normalizedNER", m.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
                        l3.set("entitylink", m.get(CoreAnnotations.WikipediaEntityAnnotation.class));
                        Timex time = m.get(TimeAnnotations.TimexAnnotation.class);
                        if (time != null) {
                            Timex.Range range = time.range();
                            l3.set("timex", (Consumer<Writer>) l4 -> {
                                l4.set("tid", time.tid());
                                l4.set("type", time.timexType());
                                l4.set("value", time.value());
                                l4.set("altValue", time.altVal());
                                l4.set("range", (range != null) ? (Consumer<Writer>) l5 -> {
                                    l5.set("begin", range.begin);
                                    l5.set("end", range.end);
                                    l5.set("duration", range.duration);
                                } : null);
                            });
                        }
                    }));
                }
                if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) {
                    l2.set("tokens", sentence.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> (Consumer<Writer>) (Writer l3) -> {
                        l3.set("index", token.index());
                        l3.set("word", token.word());
                        l3.set("originalText", token.originalText());
                        l3.set("lemma", token.lemma());
                        l3.set("characterOffsetBegin", token.beginPosition());
                        l3.set("characterOffsetEnd", token.endPosition());
                        l3.set("pos", token.tag());
                        l3.set("ner", token.ner());
                        l3.set("normalizedNER", token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
                        l3.set("speaker", token.get(CoreAnnotations.SpeakerAnnotation.class));
                        l3.set("truecase", token.get(CoreAnnotations.TrueCaseAnnotation.class));
                        l3.set("truecaseText", token.get(CoreAnnotations.TrueCaseTextAnnotation.class));
                        l3.set("before", token.get(CoreAnnotations.BeforeAnnotation.class));
                        l3.set("after", token.get(CoreAnnotations.AfterAnnotation.class));
                        l3.set("entitylink", token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
                        Timex time = token.get(TimeAnnotations.TimexAnnotation.class);
                        if (time != null) {
                            Timex.Range range = time.range();
                            l3.set("timex", (Consumer<Writer>) l4 -> {
                                l4.set("tid", time.tid());
                                l4.set("type", time.timexType());
                                l4.set("value", time.value());
                                l4.set("altValue", time.altVal());
                                l4.set("range", (range != null) ? (Consumer<Writer>) l5 -> {
                                    l5.set("begin", range.begin);
                                    l5.set("end", range.end);
                                    l5.set("duration", range.duration);
                                } : null);
                            });
                        }
                    }));
                }
            }));
        }
        if (doc.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
            Map<Integer, CorefChain> corefChains = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
            if (corefChains != null) {
                l1.set("corefs", (Consumer<Writer>) chainWriter -> {
                    for (CorefChain chain : corefChains.values()) {
                        CorefChain.CorefMention representative = chain.getRepresentativeMention();
                        chainWriter.set(Integer.toString(chain.getChainID()), chain.getMentionsInTextualOrder().stream().map(mention -> (Consumer<Writer>) (Writer mentionWriter) -> {
                            mentionWriter.set("id", mention.mentionID);
                            mentionWriter.set("text", mention.mentionSpan);
                            mentionWriter.set("type", mention.mentionType);
                            mentionWriter.set("number", mention.number);
                            mentionWriter.set("gender", mention.gender);
                            mentionWriter.set("animacy", mention.animacy);
                            mentionWriter.set("startIndex", mention.startIndex);
                            mentionWriter.set("endIndex", mention.endIndex);
                            mentionWriter.set("headIndex", mention.headIndex);
                            mentionWriter.set("sentNum", mention.sentNum);
                            mentionWriter.set("position", Arrays.stream(mention.position.elems()).boxed().collect(Collectors.toList()));
                            mentionWriter.set("isRepresentativeMention", mention == representative);
                        }));
                    }
                });
            }
        }
        if (doc.get(CoreAnnotations.QuotationsAnnotation.class) != null) {
            List<CoreMap> quotes = QuoteAnnotator.gatherQuotes(doc);
            l1.set("quotes", quotes.stream().map(quote -> (Consumer<Writer>) (Writer l2) -> {
                l2.set("id", quote.get(CoreAnnotations.QuotationIndexAnnotation.class));
                l2.set("text", quote.get(CoreAnnotations.TextAnnotation.class));
                l2.set("beginIndex", quote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
                l2.set("endIndex", quote.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
                l2.set("beginToken", quote.get(CoreAnnotations.TokenBeginAnnotation.class));
                l2.set("endToken", quote.get(CoreAnnotations.TokenEndAnnotation.class));
                l2.set("beginSentence", quote.get(CoreAnnotations.SentenceBeginAnnotation.class));
                l2.set("endSentence", quote.get(CoreAnnotations.SentenceEndAnnotation.class));
            }));
        }
    });
    // flush
    l0.writer.flush();
}
Also used : java.util(java.util) CorefChain(edu.stanford.nlp.coref.data.CorefChain) SentenceUtils(edu.stanford.nlp.ling.SentenceUtils) Tree(edu.stanford.nlp.trees.Tree) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) TimeAnnotations(edu.stanford.nlp.time.TimeAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) Pair(edu.stanford.nlp.util.Pair) CoreMap(edu.stanford.nlp.util.CoreMap) Timex(edu.stanford.nlp.time.Timex) IndexedWord(edu.stanford.nlp.ling.IndexedWord) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) TreePrint(edu.stanford.nlp.trees.TreePrint) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) IOUtils(edu.stanford.nlp.io.IOUtils) Pointer(edu.stanford.nlp.util.Pointer) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) DecimalFormat(java.text.DecimalFormat) StringOutputStream(edu.stanford.nlp.io.StringOutputStream) Collectors(java.util.stream.Collectors) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Consumer(java.util.function.Consumer) Stream(java.util.stream.Stream) java.io(java.io) Generics(edu.stanford.nlp.util.Generics) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) TreePrint(edu.stanford.nlp.trees.TreePrint) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) Consumer(java.util.function.Consumer) CorefChain(edu.stanford.nlp.coref.data.CorefChain) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) Tree(edu.stanford.nlp.trees.Tree) TimeAnnotations(edu.stanford.nlp.time.TimeAnnotations) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) 
CoreMap(edu.stanford.nlp.util.CoreMap) TreePrint(edu.stanford.nlp.trees.TreePrint) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) Timex(edu.stanford.nlp.time.Timex)

Aggregations

CoreMap (edu.stanford.nlp.util.CoreMap)251 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)170 CoreLabel (edu.stanford.nlp.ling.CoreLabel)101 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)61 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)53 ArrayList (java.util.ArrayList)52 Annotation (edu.stanford.nlp.pipeline.Annotation)47 Tree (edu.stanford.nlp.trees.Tree)27 Properties (java.util.Properties)22 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)20 List (java.util.List)20 StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP)19 Mention (edu.stanford.nlp.coref.data.Mention)17 ArrayCoreMap (edu.stanford.nlp.util.ArrayCoreMap)17 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)13 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)12 SentencesAnnotation (edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation)11 MachineReadingAnnotations (edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations)9 IndexedWord (edu.stanford.nlp.ling.IndexedWord)9 IntPair (edu.stanford.nlp.util.IntPair)9