Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class SpanishTokenizerITest, method testOffsetsSpacing.
public void testOffsetsSpacing() {
  // guide 1 2 3 4 5 6 7 8 9 0 1 2 3
  // guide 0123456789012345678901234567890123456789012345678 90123456789012345678901234567 8 901234567890123456789012345678901234567890123456789012345
  String text = " La combinación consonántica ss es ajena a la\tortografía castellana: \n\n traigámosela, mandémoselos, escribámosela, comprémoselo.";
  final TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
  tf.setOptions("");
  tf.setOptions("splitAll=true");
  Tokenizer<CoreLabel> spanishTokenizer = tf.getTokenizer(new StringReader(text));
  List<CoreLabel> tokens = spanishTokenizer.tokenize();
  System.err.println(tokens);
  assertEquals(27, tokens.size());
  // assertEquals(" ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  // assertEquals("\t", tokens.get(8).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals("Begin char offset", 2, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 4, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("La", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class));
  // note: after(x) and before(x+1) are the same
  // assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  // assertEquals(" ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
  assertEquals("escribámo", tokens.get(19).get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("escribamos", tokens.get(19).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 108, (int) tokens.get(19).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 117, (int) tokens.get(19).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("se", tokens.get(20).get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("se", tokens.get(20).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 117, (int) tokens.get(20).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 119, (int) tokens.get(20).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("la", tokens.get(21).get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("la", tokens.get(21).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 119, (int) tokens.get(21).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 121, (int) tokens.get(21).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals(",", tokens.get(22).get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals(",", tokens.get(22).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("Begin char offset", 121, (int) tokens.get(22).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("End char offset", 122, (int) tokens.get(22).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
}
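Outside of the test harness, the same factory can be used to inspect token offsets directly. A minimal sketch, assuming the import paths below are the current ones for SpanishTokenizer and the tokenizer interfaces; the sample sentence is made up:

import java.io.StringReader;

import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class SpanishTokenizerOffsets {
  public static void main(String[] args) {
    // Hypothetical input; any Spanish text with clitic verbs works here.
    String text = "Tráigamelo mañana, por favor.";
    TokenizerFactory<CoreLabel> tf = SpanishTokenizer.coreLabelFactory();
    tf.setOptions("splitAll=true");   // split clitics, contractions, etc., as in the test above
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(text));
    for (CoreLabel token : tokenizer.tokenize()) {
      // word() is the (possibly normalized) token text; the offsets point back into the original string
      System.out.printf("%s\t%d\t%d%n",
          token.word(),
          token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
          token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    }
  }
}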
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class WordsToSentencesAnnotator, method annotate.
/**
 * If setCountLineNumbers is set to true, we count line numbers by
 * telling the underlying splitter to return empty lists of tokens
 * and then treating those empty lists as empty lines. We don't
 * actually include empty sentences in the annotation, though.
 */
@Override
public void annotate(Annotation annotation) {
  if (VERBOSE) {
    log.info("Sentence splitting ...");
  }
  if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
    throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
  }
  // get text and tokens from the document
  String text = annotation.get(CoreAnnotations.TextAnnotation.class);
  List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
  // log.info("Tokens are: " + tokens);
  // assemble the sentence annotations
  int tokenOffset = 0;
  int lineNumber = 0;
  // section annotations to mark sentences with
  CoreMap sectionAnnotations = null;
  List<CoreMap> sentences = new ArrayList<>();
  for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
    if (countLineNumbers) {
      ++lineNumber;
    }
    if (sentenceTokens.isEmpty()) {
      if (!countLineNumbers) {
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      } else {
        continue;
      }
    }
    // get the sentence text from the first and last character offsets
    int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int last = sentenceTokens.size() - 1;
    int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    String sentenceText = text.substring(begin, end);
    // create a sentence annotation with text and token offsets
    Annotation sentence = new Annotation(sentenceText);
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
    sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
    tokenOffset += sentenceTokens.size();
    sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
    sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
    if (countLineNumbers) {
      sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
    }
    // Annotate sentence with section information.
    // Assume section start and end appear as first and last tokens of sentence
    CoreLabel sentenceStartToken = sentenceTokens.get(0);
    CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);
    CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
    if (sectionStart != null) {
      // Section is started
      sectionAnnotations = sectionStart;
    }
    if (sectionAnnotations != null) {
      // transfer annotations over to sentence
      ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
    }
    String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
    if (sectionEnd != null) {
      sectionAnnotations = null;
    }
    if (docID != null) {
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
    }
    int index = 1;
    for (CoreLabel token : sentenceTokens) {
      token.setIndex(index++);
      token.setSentIndex(sentences.size());
      if (docID != null) {
        token.setDocID(docID);
      }
    }
    // add the sentence to the list
    sentences.add(sentence);
  }
  // the condition below is possible if sentenceBoundaryToDiscard is initialized!
  /*
  if (tokenOffset != tokens.size()) {
    throw new RuntimeException(String.format(
        "expected %d tokens, found %d", tokens.size(), tokenOffset));
  }
  */
  // add the sentences annotations to the document
  annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
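This annotator is normally not constructed by hand; it runs as the ssplit stage of a StanfordCoreNLP pipeline, which exposes the SentencesAnnotation set at the end of annotate(). A minimal usage sketch, assuming the standard "tokenize, ssplit" annotator configuration:

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class SentenceSplitDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("First sentence. Second sentence.");
    pipeline.annotate(document);

    // Each sentence carries the index, character, and token offsets set in annotate() above.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.SentenceIndexAnnotation.class) + ": "
          + sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}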
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class XMLOutputter, method annotationToDoc.
/**
 * Converts the given annotation to an XML document using the specified options.
 */
public static Document annotationToDoc(Annotation annotation, Options options) {
  //
  // create the XML document with the root node pointing to the namespace URL
  //
  Element root = new Element("root", NAMESPACE_URI);
  Document xmlDoc = new Document(root);
  ProcessingInstruction pi = new ProcessingInstruction("xml-stylesheet", "href=\"" + STYLESHEET_NAME + "\" type=\"text/xsl\"");
  xmlDoc.insertChild(pi, 0);
  Element docElem = new Element("document", NAMESPACE_URI);
  root.appendChild(docElem);
  setSingleElement(docElem, "docId", NAMESPACE_URI, annotation.get(CoreAnnotations.DocIDAnnotation.class));
  setSingleElement(docElem, "docDate", NAMESPACE_URI, annotation.get(CoreAnnotations.DocDateAnnotation.class));
  setSingleElement(docElem, "docSourceType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocSourceTypeAnnotation.class));
  setSingleElement(docElem, "docType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocTypeAnnotation.class));
  setSingleElement(docElem, "author", NAMESPACE_URI, annotation.get(CoreAnnotations.AuthorAnnotation.class));
  setSingleElement(docElem, "location", NAMESPACE_URI, annotation.get(CoreAnnotations.LocationAnnotation.class));
  if (options.includeText) {
    setSingleElement(docElem, "text", NAMESPACE_URI, annotation.get(CoreAnnotations.TextAnnotation.class));
  }
  Element sentencesElem = new Element("sentences", NAMESPACE_URI);
  docElem.appendChild(sentencesElem);
  //
  if (annotation.get(CoreAnnotations.SentencesAnnotation.class) != null) {
    int sentCount = 1;
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      Element sentElem = new Element("sentence", NAMESPACE_URI);
      sentElem.addAttribute(new Attribute("id", Integer.toString(sentCount)));
      Integer lineNumber = sentence.get(CoreAnnotations.LineNumberAnnotation.class);
      if (lineNumber != null) {
        sentElem.addAttribute(new Attribute("line", Integer.toString(lineNumber)));
      }
      sentCount++;
      // add the word table with all token-level annotations
      Element wordTable = new Element("tokens", NAMESPACE_URI);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); j++) {
        Element wordInfo = new Element("token", NAMESPACE_URI);
        addWordInfo(wordInfo, tokens.get(j), j + 1, NAMESPACE_URI);
        wordTable.appendChild(wordInfo);
      }
      sentElem.appendChild(wordTable);
      // add tree info
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      if (tree != null) {
        // add the constituent tree for this sentence
        Element parseInfo = new Element("parse", NAMESPACE_URI);
        addConstituentTreeInfo(parseInfo, tree, options.constituentTreePrinter);
        sentElem.appendChild(parseInfo);
      }
      SemanticGraph basicDependencies = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      if (basicDependencies != null) {
        // add the dependencies for this sentence
        Element depInfo = buildDependencyTreeInfo("basic-dependencies", sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("collapsed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("collapsed-ccprocessed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("enhanced-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("enhanced-plus-plus-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
      }
      // add Open IE triples
      Collection<RelationTriple> openieTriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
      if (openieTriples != null) {
        Element openieElem = new Element("openie", NAMESPACE_URI);
        addTriples(openieTriples, openieElem, NAMESPACE_URI);
        sentElem.appendChild(openieElem);
      }
      // add KBP triples
      Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
      if (kbpTriples != null) {
        Element kbpElem = new Element("kbp", NAMESPACE_URI);
        addTriples(kbpTriples, kbpElem, NAMESPACE_URI);
        sentElem.appendChild(kbpElem);
      }
      // add the MachineReading entities and relations
      List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
      if (entities != null && !entities.isEmpty()) {
        Element mrElem = new Element("MachineReading", NAMESPACE_URI);
        Element entElem = new Element("entities", NAMESPACE_URI);
        addEntities(entities, entElem, NAMESPACE_URI);
        mrElem.appendChild(entElem);
        if (relations != null) {
          Element relElem = new Element("relations", NAMESPACE_URI);
          addRelations(relations, relElem, NAMESPACE_URI, options.relationsBeam);
          mrElem.appendChild(relElem);
        }
        sentElem.appendChild(mrElem);
      }
      // add sentiment as an attribute of this sentence
      Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
      if (sentimentTree != null) {
        int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
        sentElem.addAttribute(new Attribute("sentimentValue", Integer.toString(sentiment)));
        String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
        sentElem.addAttribute(new Attribute("sentiment", sentimentClass.replaceAll(" ", "")));
      }
      // add the sentence to the root
      sentencesElem.appendChild(sentElem);
    }
  }
  //
  // add the coref graph
  //
  Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
  if (corefChains != null) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    Element corefInfo = new Element("coreference", NAMESPACE_URI);
    if (addCorefGraphInfo(options, corefInfo, sentences, corefChains, NAMESPACE_URI)) {
      docElem.appendChild(corefInfo);
    }
  }
  return xmlDoc;
}
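A hedged sketch of driving this method end to end: as I recall the API, XMLOutputter.xmlPrint builds the XML Document (via annotationToDoc) and serializes it to a stream, so the call below is one convenient entry point rather than the only one; the annotator list and sample text are illustrative.

import java.io.IOException;
import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.pipeline.XMLOutputter;

public class XmlOutputDemo {
  public static void main(String[] args) throws IOException {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Stanford is in California.");
    pipeline.annotate(document);

    // Builds the XML document for the annotation and writes it to System.out.
    XMLOutputter.xmlPrint(document, System.out, pipeline);
  }
}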
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class TokensRegexNERAnnotator, method annotateMatched.
private void annotateMatched(List<CoreLabel> tokens) {
  List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens);
  for (SequenceMatchResult<CoreMap> m : matched) {
    Entry entry = patternToEntry.get(m.pattern());
    // Check if we will overwrite the existing annotation with this annotation
    int g = entry.annotateGroup;
    int start = m.start(g);
    int end = m.end(g);
    String str = m.group(g);
    if (commonWords.contains(str)) {
      if (verbose) {
        log.info("Not annotating (common word) '" + str + "': " + StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
      }
      continue;
    }
    boolean overwriteOriginalNer = checkPosTags(tokens, start, end);
    if (overwriteOriginalNer) {
      overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end);
    }
    if (overwriteOriginalNer) {
      for (int i = start; i < end; i++) {
        CoreLabel token = tokens.get(i);
        for (int j = 0; j < annotationFields.size(); j++) {
          token.set(annotationFields.get(j), entry.types[j]);
        }
        // tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type);
      }
    } else {
      if (verbose) {
        log.info("Not annotating '" + m.group(g) + "': " + StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'");
      }
    }
  }
}
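annotateMatched is private and is reached through the annotator's public annotate method; the patterns and target NER types come from a tab-separated mapping file. A sketch under the assumption that a single-argument TokensRegexNERAnnotator(String mapping) constructor is available; the mapping file name and its contents are hypothetical.

import java.util.Properties;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.pipeline.TokensRegexNERAnnotator;

public class TokensRegexNerDemo {
  public static void main(String[] args) {
    // Run the usual upstream annotators so tokens already carry POS and NER tags.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("She studied at Stanford University.");
    pipeline.annotate(document);

    // "university_mapping.tab" is a hypothetical TSV file of the form:
    //   pattern \t NER type \t overwritable types \t priority
    //   Stanford University \t UNIVERSITY \t ORGANIZATION \t 1.0
    TokensRegexNERAnnotator regexNer = new TokensRegexNERAnnotator("university_mapping.tab");
    regexNer.annotate(document);   // internally reaches annotateMatched() on each sentence's tokens
  }
}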
Use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
The class TrueCaseAnnotator, method annotate.
@Override
public void annotate(Annotation annotation) {
  if (verbose) {
    log.info("Adding true-case annotation...");
  }
  if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
    // classify tokens for each sentence
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> output = this.trueCaser.classifySentence(tokens);
      for (int i = 0, size = tokens.size(); i < size; i++) {
        // add the truecaser tag to each token
        String neTag = output.get(i).get(CoreAnnotations.AnswerAnnotation.class);
        tokens.get(i).set(CoreAnnotations.TrueCaseAnnotation.class, neTag);
        setTrueCaseText(tokens.get(i));
      }
    }
  } else {
    throw new RuntimeException("unable to find sentences in: " + annotation);
  }
}
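A sketch of running this annotator inside a pipeline and reading back the TrueCaseAnnotation and TrueCaseTextAnnotation it sets; the "tokenize, ssplit, pos, lemma, truecase" annotator list is the usual configuration for truecasing, but check the model requirements of your CoreNLP version.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class TrueCaseDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, truecase");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("john lives in new york city.");
    pipeline.annotate(document);

    // Print each token with its truecased form and the case label assigned above.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.println(token.originalText() + " -> "
            + token.get(CoreAnnotations.TrueCaseTextAnnotation.class)
            + " (" + token.get(CoreAnnotations.TrueCaseAnnotation.class) + ")");
      }
    }
  }
}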