Search in sources:

Example 21 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class SpanishTokenizerITest method testOffsetsSpacing.

/**
 * Checks token count, character offsets, original text and normalized text
 * produced by the Spanish tokenizer (with {@code splitAll=true}) on input that
 * contains irregular whitespace and verbs with attached clitic pronouns.
 */
public void testOffsetsSpacing() {
    // guide                 1         2         3         4          5         6         7           8         9         0         1         2         3
    // guide       0123456789012345678901234567890123456789012345678 90123456789012345678901234567 8 901234567890123456789012345678901234567890123456789012345
    String text = "  La   combinación consonántica ss es ajena a la\tortografía    castellana:   \n\n traigámosela, mandémoselos, escribámosela, comprémoselo.";

    // Options are set twice, exactly as before: first to the empty string, then to splitAll.
    final TokenizerFactory<CoreLabel> factory = SpanishTokenizer.coreLabelFactory();
    factory.setOptions("");
    factory.setOptions("splitAll=true");

    List<CoreLabel> tokens = factory.getTokenizer(new StringReader(text)).tokenize();
    System.err.println(tokens);
    assertEquals(27, tokens.size());

    // Leading token "La", preceded by two spaces in the input.
    // Disabled check: assertEquals("  ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
    // Disabled check: assertEquals("\t", tokens.get(8).get(CoreAnnotations.AfterAnnotation.class));
    CoreLabel firstToken = tokens.get(0);
    assertEquals("Begin char offset", 2, firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals("End char offset", 4, firstToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());
    assertEquals("La", firstToken.get(CoreAnnotations.OriginalTextAnnotation.class));
    // note: after(x) and before(x+1) are the same
    // Disabled check: assertEquals("   ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
    // Disabled check: assertEquals("   ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));

    // Verb stem of "escribámosela": the original text and the token text are expected to differ.
    CoreLabel verbStem = tokens.get(19);
    assertEquals("escribámo", verbStem.get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("escribamos", verbStem.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 108, verbStem.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals("End char offset", 117, verbStem.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());

    // First clitic, "se".
    CoreLabel firstClitic = tokens.get(20);
    assertEquals("se", firstClitic.get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("se", firstClitic.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 117, firstClitic.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals("End char offset", 119, firstClitic.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());

    // Second clitic, "la".
    CoreLabel secondClitic = tokens.get(21);
    assertEquals("la", secondClitic.get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("la", secondClitic.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 119, secondClitic.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals("End char offset", 121, secondClitic.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());

    // Comma that follows the verb.
    CoreLabel comma = tokens.get(22);
    assertEquals(",", comma.get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals(",", comma.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("Begin char offset", 121, comma.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class).intValue());
    assertEquals("End char offset", 122, comma.get(CoreAnnotations.CharacterOffsetEndAnnotation.class).intValue());
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) StringReader(java.io.StringReader) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations)

Example 22 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class WordsToSentencesAnnotator method annotate.

/**
 * Groups the document's tokens into sentences and stores the resulting
 * sentence annotations under {@code SentencesAnnotation}.
 *
 * When {@code countLineNumbers} is on, every token list returned by the
 * splitter advances the line counter, and empty lists are counted as lines
 * but never turned into sentences. When it is off, an empty list is an error.
 *
 * @param annotation document annotation; must already contain {@code TokensAnnotation}
 * @throws IllegalArgumentException if the annotation has no tokens
 * @throws IllegalStateException if the splitter yields an empty sentence while
 *         line counting is disabled
 */
@Override
public void annotate(Annotation annotation) {
    if (VERBOSE) {
        log.info("Sentence splitting ...");
    }
    if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
        throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // Document-level inputs.
    final String documentText = annotation.get(CoreAnnotations.TextAnnotation.class);
    final List<CoreLabel> documentTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    final String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);

    // Running state carried from one sentence to the next.
    int nextTokenIndex = 0;
    int currentLine = 0;
    // Section whose annotations are currently being copied onto sentences (null = none open).
    CoreMap activeSection = null;
    List<CoreMap> sentences = new ArrayList<>();

    for (List<CoreLabel> sentenceTokens : wts.process(documentTokens)) {
        if (countLineNumbers) {
            currentLine++;
        }
        if (sentenceTokens.isEmpty()) {
            if (countLineNumbers) {
                // An empty list only stands for an empty line; no sentence is emitted for it.
                continue;
            }
            throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
        }

        // The sentence text is the slice of the document between the first and last token offsets.
        final CoreLabel firstToken = sentenceTokens.get(0);
        final CoreLabel lastToken = sentenceTokens.get(sentenceTokens.size() - 1);
        final int begin = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        final int end = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        final String sentenceText = documentText.substring(begin, end);

        final int sentenceIndex = sentences.size();
        final int tokenBegin = nextTokenIndex;
        nextTokenIndex += sentenceTokens.size();

        // Character offsets, token span and position of the sentence within the document.
        Annotation sentence = new Annotation(sentenceText);
        sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
        sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
        sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
        sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin);
        sentence.set(CoreAnnotations.TokenEndAnnotation.class, nextTokenIndex);
        sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);
        if (countLineNumbers) {
            sentence.set(CoreAnnotations.LineNumberAnnotation.class, currentLine);
        }

        // Section handling: a section is assumed to open on the first token of a
        // sentence and to close on the last token of a sentence.
        CoreMap openedSection = firstToken.get(CoreAnnotations.SectionStartAnnotation.class);
        if (openedSection != null) {
            activeSection = openedSection;
        }
        if (activeSection != null) {
            // Only annotations the sentence does not already carry are copied over.
            ChunkAnnotationUtils.copyUnsetAnnotations(activeSection, sentence);
        }
        if (lastToken.get(CoreAnnotations.SectionEndAnnotation.class) != null) {
            activeSection = null;
        }

        // The document id is written after the section copy, as before.
        final boolean hasDocId = docID != null;
        if (hasDocId) {
            sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
        }

        // Tokens get a 1-based index within the sentence plus the sentence index.
        int tokenPosition = 0;
        for (CoreLabel token : sentenceTokens) {
            tokenPosition++;
            token.setIndex(tokenPosition);
            token.setSentIndex(sentenceIndex);
            if (hasDocId) {
                token.setDocID(docID);
            }
        }

        sentences.add(sentence);
    }

    // the condition below is possible if sentenceBoundaryToDiscard is initialized!
    /*
      if (tokenOffset != tokens.size()) {
        throw new RuntimeException(String.format(
            "expected %d tokens, found %d", tokens.size(), tokenOffset));
      }
      */

    // Publish the sentence list on the document.
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation)

Example 23 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class XMLOutputter method annotationToDoc.

/**
 * Renders an annotation as an XML document.
 *
 * The document gets a stylesheet processing instruction, a {@code document}
 * element with document-level metadata, one {@code sentence} element per
 * sentence (tokens, parse, dependencies, Open IE and KBP triples, machine
 * reading output, sentiment) and, when coreference chains are present, a
 * {@code coreference} element.
 *
 * @param annotation the annotation to serialize
 * @param options    output options; this method reads {@code includeText},
 *                   {@code constituentTreePrinter} and {@code relationsBeam},
 *                   and hands the whole object to the coreference writer
 * @return the assembled XML document
 */
public static Document annotationToDoc(Annotation annotation, Options options) {
    // Root element, document wrapper and stylesheet reference.
    Element root = new Element("root", NAMESPACE_URI);
    Document xmlDoc = new Document(root);
    String stylesheetData = "href=\"" + STYLESHEET_NAME + "\" type=\"text/xsl\"";
    xmlDoc.insertChild(new ProcessingInstruction("xml-stylesheet", stylesheetData), 0);

    Element docElem = new Element("document", NAMESPACE_URI);
    root.appendChild(docElem);

    // Document-level metadata.
    setSingleElement(docElem, "docId", NAMESPACE_URI, annotation.get(CoreAnnotations.DocIDAnnotation.class));
    setSingleElement(docElem, "docDate", NAMESPACE_URI, annotation.get(CoreAnnotations.DocDateAnnotation.class));
    setSingleElement(docElem, "docSourceType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocSourceTypeAnnotation.class));
    setSingleElement(docElem, "docType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocTypeAnnotation.class));
    setSingleElement(docElem, "author", NAMESPACE_URI, annotation.get(CoreAnnotations.AuthorAnnotation.class));
    setSingleElement(docElem, "location", NAMESPACE_URI, annotation.get(CoreAnnotations.LocationAnnotation.class));
    if (options.includeText) {
        setSingleElement(docElem, "text", NAMESPACE_URI, annotation.get(CoreAnnotations.TextAnnotation.class));
    }

    // The <sentences> container is emitted even when there are no sentences.
    Element sentencesElem = new Element("sentences", NAMESPACE_URI);
    docElem.appendChild(sentencesElem);

    final List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences != null) {
        // Element names of the dependency sections, in output order.
        final String[] dependencyKinds = {
            "basic-dependencies",
            "collapsed-dependencies",
            "collapsed-ccprocessed-dependencies",
            "enhanced-dependencies",
            "enhanced-plus-plus-dependencies"
        };

        int sentenceId = 0;
        for (CoreMap sentence : sentences) {
            sentenceId++;
            Element sentElem = new Element("sentence", NAMESPACE_URI);
            sentElem.addAttribute(new Attribute("id", String.valueOf(sentenceId)));
            Integer lineNumber = sentence.get(CoreAnnotations.LineNumberAnnotation.class);
            if (lineNumber != null) {
                sentElem.addAttribute(new Attribute("line", Integer.toString(lineNumber)));
            }

            // Token table: one <token> per token, numbered from 1.
            Element tokensElem = new Element("tokens", NAMESPACE_URI);
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
            int tokenPosition = 0;
            for (CoreLabel token : tokens) {
                tokenPosition++;
                Element tokenElem = new Element("token", NAMESPACE_URI);
                addWordInfo(tokenElem, token, tokenPosition, NAMESPACE_URI);
                tokensElem.appendChild(tokenElem);
            }
            sentElem.appendChild(tokensElem);

            // Constituency parse, when available.
            Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
            if (tree != null) {
                Element parseElem = new Element("parse", NAMESPACE_URI);
                addConstituentTreeInfo(parseElem, tree, options.constituentTreePrinter);
                sentElem.appendChild(parseElem);
            }

            // Dependency graphs; every variant is attempted only when basic dependencies exist.
            SemanticGraph basicDependencies = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
            if (basicDependencies != null) {
                final SemanticGraph[] dependencyGraphs = {
                    basicDependencies,
                    sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class),
                    sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class),
                    sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class),
                    sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)
                };
                for (int k = 0; k < dependencyKinds.length; k++) {
                    Element depInfo = buildDependencyTreeInfo(dependencyKinds[k], dependencyGraphs[k], tokens, NAMESPACE_URI);
                    if (depInfo != null) {
                        sentElem.appendChild(depInfo);
                    }
                }
            }

            // Open IE triples.
            Collection<RelationTriple> openieTriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
            if (openieTriples != null) {
                Element openieElem = new Element("openie", NAMESPACE_URI);
                addTriples(openieTriples, openieElem, NAMESPACE_URI);
                sentElem.appendChild(openieElem);
            }

            // KBP triples.
            Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
            if (kbpTriples != null) {
                Element kbpElem = new Element("kbp", NAMESPACE_URI);
                addTriples(kbpTriples, kbpElem, NAMESPACE_URI);
                sentElem.appendChild(kbpElem);
            }

            // Machine-reading output: relations are written only alongside a non-empty entity list.
            List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
            List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
            boolean hasEntities = !(entities == null || entities.isEmpty());
            if (hasEntities) {
                Element mrElem = new Element("MachineReading", NAMESPACE_URI);
                Element entElem = new Element("entities", NAMESPACE_URI);
                addEntities(entities, entElem, NAMESPACE_URI);
                mrElem.appendChild(entElem);
                if (relations != null) {
                    Element relElem = new Element("relations", NAMESPACE_URI);
                    addRelations(relations, relElem, NAMESPACE_URI, options.relationsBeam);
                    mrElem.appendChild(relElem);
                }
                sentElem.appendChild(mrElem);
            }

            // Sentiment is recorded as two attributes on the sentence element:
            // the predicted class number and the class label with spaces removed.
            Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
            if (sentimentTree != null) {
                int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
                sentElem.addAttribute(new Attribute("sentimentValue", Integer.toString(sentiment)));
                String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
                sentElem.addAttribute(new Attribute("sentiment", sentimentClass.replace(" ", "")));
            }

            sentencesElem.appendChild(sentElem);
        }
    }

    // Coreference: the element is attached only when the helper returns true.
    Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    if (corefChains != null) {
        Element corefInfo = new Element("coreference", NAMESPACE_URI);
        if (addCorefGraphInfo(options, corefInfo, sentences, corefChains, NAMESPACE_URI)) {
            docElem.appendChild(corefInfo);
        }
    }
    return xmlDoc;
}
Also used : RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) MachineReadingAnnotations(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) CorefChain(edu.stanford.nlp.coref.data.CorefChain) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) Tree(edu.stanford.nlp.trees.Tree) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) TreePrint(edu.stanford.nlp.trees.TreePrint) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 24 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class TokensRegexNERAnnotator method annotateMatched.

/**
 * Applies the non-overlapping pattern matches found in {@code tokens}.
 *
 * For each match, the tokens in the entry's annotate-group are given the
 * entry's types for every configured annotation field. A match is skipped when
 * its text is in {@code commonWords}, or when the POS-tag check or the
 * original-NER-tag check rejects it.
 *
 * @param tokens the tokens to match against and annotate in place
 */
private void annotateMatched(List<CoreLabel> tokens) {
    for (SequenceMatchResult<CoreMap> match : multiPatternMatcher.findNonOverlapping(tokens)) {
        Entry entry = patternToEntry.get(match.pattern());
        final int group = entry.annotateGroup;
        final int start = match.start(group);
        final int end = match.end(group);
        final String matchedText = match.group(group);

        if (commonWords.contains(matchedText)) {
            if (verbose) {
                log.info("Not annotating (common word) '" + matchedText + "': " + StringUtils.joinFields(match.groupNodes(group), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
            }
            continue;
        }

        // Existing annotations are overwritten only if both checks pass;
        // the NER-tag check is not run when the POS-tag check fails.
        final boolean overwriteOriginalNer = checkPosTags(tokens, start, end) && checkOrigNerTags(entry, tokens, start, end);

        if (overwriteOriginalNer) {
            for (int pos = start; pos < end; pos++) {
                CoreLabel token = tokens.get(pos);
                // Field k of the configured annotation fields receives type k of the entry.
                for (int k = 0; k < annotationFields.size(); k++) {
                    token.set(annotationFields.get(k), entry.types[k]);
                }
            }
        } else if (verbose) {
            log.info("Not annotating  '" + matchedText + "': " + StringUtils.joinFields(match.groupNodes(group), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
        }
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel)

Example 25 with CoreLabel

use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.

the class TrueCaseAnnotator method annotate.

/**
 * Adds a true-case tag to every token of every sentence.
 *
 * Each sentence's tokens are run through the true-caser; the answer it
 * assigns to position {@code i} is stored as the {@code TrueCaseAnnotation}
 * of token {@code i}, after which {@code setTrueCaseText} is called on that token.
 *
 * @param annotation document annotation; must contain {@code SentencesAnnotation}
 * @throws RuntimeException if the annotation has no sentences
 */
@Override
public void annotate(Annotation annotation) {
    if (verbose) {
        log.info("Adding true-case annotation...");
    }
    if (!annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
        throw new RuntimeException("unable to find sentences in: " + annotation);
    }

    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> classified = this.trueCaser.classifySentence(tokens);
        final int tokenCount = tokens.size();
        for (int idx = 0; idx < tokenCount; idx++) {
            // The classifier's answer for this position becomes the token's true-case tag.
            String trueCaseTag = classified.get(idx).get(CoreAnnotations.AnswerAnnotation.class);
            CoreLabel token = tokens.get(idx);
            token.set(CoreAnnotations.TrueCaseAnnotation.class, trueCaseTag);
            setTrueCaseText(token);
        }
    }
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Aggregations

CoreLabel (edu.stanford.nlp.ling.CoreLabel): 533, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 310, CoreMap (edu.stanford.nlp.util.CoreMap): 102, ArrayList (java.util.ArrayList): 101, Tree (edu.stanford.nlp.trees.Tree): 98, SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 96, TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 63, SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 53, ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 41, IndexedWord (edu.stanford.nlp.ling.IndexedWord): 38, List (java.util.List): 33, Annotation (edu.stanford.nlp.pipeline.Annotation): 31, Mention (edu.stanford.nlp.coref.data.Mention): 29, Label (edu.stanford.nlp.ling.Label): 28, ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 26, Properties (java.util.Properties): 24, CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations): 21, CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation): 19, SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge): 18, StringReader (java.io.StringReader): 18