Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class WordsToSentencesAnnotator, method annotate().
/**
 * If setCountLineNumbers is set to true, we count line numbers by
 * telling the underlying splitter to return empty lists of tokens
 * and then treating those empty lists as empty lines. We don't
 * actually include empty sentences in the annotation, though.
 */
@Override
public void annotate(Annotation annotation) {
  if (VERBOSE) {
    log.info("Sentence splitting ...");
  }
  if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
    throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
  }
  // get text and tokens from the document
  String text = annotation.get(CoreAnnotations.TextAnnotation.class);
  List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
  // log.info("Tokens are: " + tokens);
  // assemble the sentence annotations
  int tokenOffset = 0;
  int lineNumber = 0;
  // section annotations to mark sentences with
  CoreMap sectionAnnotations = null;
  List<CoreMap> sentences = new ArrayList<>();
  for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
    if (countLineNumbers) {
      ++lineNumber;
    }
    if (sentenceTokens.isEmpty()) {
      if (!countLineNumbers) {
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      } else {
        continue;
      }
    }
    // get the sentence text from the first and last character offsets
    int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int last = sentenceTokens.size() - 1;
    int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    String sentenceText = text.substring(begin, end);
    // create a sentence annotation with text and token offsets
    Annotation sentence = new Annotation(sentenceText);
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
    sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
    tokenOffset += sentenceTokens.size();
    sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
    sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
    if (countLineNumbers) {
      sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
    }
    // Annotate sentence with section information.
    // Assume section start and end appear as first and last tokens of sentence
    CoreLabel sentenceStartToken = sentenceTokens.get(0);
    CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);
    CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
    if (sectionStart != null) {
      // section is started
      sectionAnnotations = sectionStart;
    }
    if (sectionAnnotations != null) {
      // transfer annotations over to sentence
      ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
    }
    String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
    if (sectionEnd != null) {
      sectionAnnotations = null;
    }
    if (docID != null) {
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
    }
    int index = 1;
    for (CoreLabel token : sentenceTokens) {
      token.setIndex(index++);
      token.setSentIndex(sentences.size());
      if (docID != null) {
        token.setDocID(docID);
      }
    }
    // add the sentence to the list
    sentences.add(sentence);
  }
  // The sanity check below is commented out because it can fail when
  // sentenceBoundaryToDiscard is set: discarded boundary tokens are never
  // added to a sentence, so tokenOffset falls short of tokens.size().
  /*
  if (tokenOffset != tokens.size()) {
    throw new RuntimeException(String.format(
        "expected %d tokens, found %d", tokens.size(), tokenOffset));
  }
  */
  // add the sentences annotations to the document
  annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
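
For context, a minimal usage sketch (not part of the CoreNLP source): in a standard pipeline the annotate() method above is invoked by the ssplit stage, after tokenization. The demo class name below is invented for illustration; the property names and annotation keys are the standard CoreNLP ones.

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

// Hypothetical demo class showing how WordsToSentencesAnnotator is normally reached.
public class SsplitDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation("This is one sentence. Here is another.");
    pipeline.annotate(document);
    // annotate() stored its result under SentencesAnnotation
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}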
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class XMLOutputter, method annotationToDoc().
/**
 * Converts the given annotation to an XML document using the specified options.
 */
public static Document annotationToDoc(Annotation annotation, Options options) {
  //
  // create the XML document with the root node pointing to the namespace URL
  //
  Element root = new Element("root", NAMESPACE_URI);
  Document xmlDoc = new Document(root);
  ProcessingInstruction pi = new ProcessingInstruction("xml-stylesheet", "href=\"" + STYLESHEET_NAME + "\" type=\"text/xsl\"");
  xmlDoc.insertChild(pi, 0);
  Element docElem = new Element("document", NAMESPACE_URI);
  root.appendChild(docElem);
  setSingleElement(docElem, "docId", NAMESPACE_URI, annotation.get(CoreAnnotations.DocIDAnnotation.class));
  setSingleElement(docElem, "docDate", NAMESPACE_URI, annotation.get(CoreAnnotations.DocDateAnnotation.class));
  setSingleElement(docElem, "docSourceType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocSourceTypeAnnotation.class));
  setSingleElement(docElem, "docType", NAMESPACE_URI, annotation.get(CoreAnnotations.DocTypeAnnotation.class));
  setSingleElement(docElem, "author", NAMESPACE_URI, annotation.get(CoreAnnotations.AuthorAnnotation.class));
  setSingleElement(docElem, "location", NAMESPACE_URI, annotation.get(CoreAnnotations.LocationAnnotation.class));
  if (options.includeText) {
    setSingleElement(docElem, "text", NAMESPACE_URI, annotation.get(CoreAnnotations.TextAnnotation.class));
  }
  Element sentencesElem = new Element("sentences", NAMESPACE_URI);
  docElem.appendChild(sentencesElem);
  if (annotation.get(CoreAnnotations.SentencesAnnotation.class) != null) {
    int sentCount = 1;
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      Element sentElem = new Element("sentence", NAMESPACE_URI);
      sentElem.addAttribute(new Attribute("id", Integer.toString(sentCount)));
      Integer lineNumber = sentence.get(CoreAnnotations.LineNumberAnnotation.class);
      if (lineNumber != null) {
        sentElem.addAttribute(new Attribute("line", Integer.toString(lineNumber)));
      }
      sentCount++;
      // add the word table with all token-level annotations
      Element wordTable = new Element("tokens", NAMESPACE_URI);
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < tokens.size(); j++) {
        Element wordInfo = new Element("token", NAMESPACE_URI);
        addWordInfo(wordInfo, tokens.get(j), j + 1, NAMESPACE_URI);
        wordTable.appendChild(wordInfo);
      }
      sentElem.appendChild(wordTable);
      // add tree info
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
      if (tree != null) {
        // add the constituent tree for this sentence
        Element parseInfo = new Element("parse", NAMESPACE_URI);
        addConstituentTreeInfo(parseInfo, tree, options.constituentTreePrinter);
        sentElem.appendChild(parseInfo);
      }
      SemanticGraph basicDependencies = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      if (basicDependencies != null) {
        // add the dependencies for this sentence
        Element depInfo = buildDependencyTreeInfo("basic-dependencies", sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("collapsed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("collapsed-ccprocessed-dependencies", sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("enhanced-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
        depInfo = buildDependencyTreeInfo("enhanced-plus-plus-dependencies", sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class), tokens, NAMESPACE_URI);
        if (depInfo != null) {
          sentElem.appendChild(depInfo);
        }
      }
      // add Open IE triples
      Collection<RelationTriple> openieTriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
      if (openieTriples != null) {
        Element openieElem = new Element("openie", NAMESPACE_URI);
        addTriples(openieTriples, openieElem, NAMESPACE_URI);
        sentElem.appendChild(openieElem);
      }
      // add KBP triples
      Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
      if (kbpTriples != null) {
        Element kbpElem = new Element("kbp", NAMESPACE_URI);
        addTriples(kbpTriples, kbpElem, NAMESPACE_URI);
        sentElem.appendChild(kbpElem);
      }
      // add the MachineReading entities and relations
      List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
      List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
      if (entities != null && !entities.isEmpty()) {
        Element mrElem = new Element("MachineReading", NAMESPACE_URI);
        Element entElem = new Element("entities", NAMESPACE_URI);
        addEntities(entities, entElem, NAMESPACE_URI);
        mrElem.appendChild(entElem);
        if (relations != null) {
          Element relElem = new Element("relations", NAMESPACE_URI);
          addRelations(relations, relElem, NAMESPACE_URI, options.relationsBeam);
          mrElem.appendChild(relElem);
        }
        sentElem.appendChild(mrElem);
      }
      // add sentiment as an attribute of this sentence
      Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
      if (sentimentTree != null) {
        int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
        sentElem.addAttribute(new Attribute("sentimentValue", Integer.toString(sentiment)));
        String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
        sentElem.addAttribute(new Attribute("sentiment", sentimentClass.replaceAll(" ", "")));
      }
      // add the sentence to the root
      sentencesElem.appendChild(sentElem);
    }
  }
  //
  // add the coref graph
  //
  Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
  if (corefChains != null) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    Element corefInfo = new Element("coreference", NAMESPACE_URI);
    if (addCorefGraphInfo(options, corefInfo, sentences, corefChains, NAMESPACE_URI)) {
      docElem.appendChild(corefInfo);
    }
  }
  return xmlDoc;
}
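
A short usage sketch (not from the source) of how this converter is typically exercised: StanfordCoreNLP exposes an xmlPrint convenience method that builds the XOM Document via annotationToDoc() and serializes it. The demo class name is invented for illustration.

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

// Hypothetical demo class; xmlPrint internally goes through annotationToDoc().
public class XmlOutputDemo {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation("Stanford is in California.");
    pipeline.annotate(document);
    // serialize the annotation as XML to stdout
    pipeline.xmlPrint(document, System.out);
  }
}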
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class WordToSentenceProcessor, method wordsToSentences().
/**
 * Returns a List of Lists where each element is built from a run
 * of Words in the input Document. Specifically, reads through each word in
 * the input document and breaks off a sentence after finding a valid
 * sentence boundary token or end of file.
 * Note that for this to work, the words in the
 * input document must have been tokenized with a tokenizer that makes
 * sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
 *
 * @param words A list of already tokenized words (must implement HasWord or be a String).
 * @return A list of sentences.
 * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
 */
public List<List<IN>> wordsToSentences(List<? extends IN> words) {
  // is null unless used by sentenceBoundaryMultiTokenPattern
  IdentityHashMap<Object, Boolean> isSentenceBoundary = null;
  if (sentenceBoundaryMultiTokenPattern != null) {
    // Do initial pass using tokensregex to identify multi token patterns that need to be matched
    // and add the last token to our table of sentence boundary tokens
    isSentenceBoundary = new IdentityHashMap<>();
    SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
    while (matcher.find()) {
      List nodes = matcher.groupNodes();
      if (nodes != null && !nodes.isEmpty()) {
        isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
      }
    }
  }
  // Split tokens into sentences!!!
  List<List<IN>> sentences = Generics.newArrayList();
  List<IN> currentSentence = new ArrayList<>();
  List<IN> lastSentence = null;
  boolean insideRegion = false;
  boolean inWaitForForcedEnd = false;
  boolean lastTokenWasNewline = false;
  for (IN o : words) {
    String word = getString(o);
    boolean forcedEnd = isForcedEndToken(o);
    boolean inMultiTokenExpr = false;
    boolean discardToken = false;
    if (o instanceof CoreMap) {
      // Hacky stuff to ensure sentence breaks do not happen in certain cases
      CoreMap cm = (CoreMap) o;
      Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
      if (!forcedEnd) {
        if (forcedUntilEndValue != null && forcedUntilEndValue) {
          inWaitForForcedEnd = true;
        } else {
          MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
          if (mt != null && !mt.isEnd()) {
            // In the middle of a multi token mention, make sure sentence is not ended here
            inMultiTokenExpr = true;
          }
        }
      }
    }
    if (tokenPatternsToDiscard != null) {
      discardToken = matchesTokenPatternsToDiscard(word);
    }
    if (sentenceRegionBeginPattern != null && !insideRegion) {
      if (DEBUG) {
        log.info("Word is " + word + "; outside region; deleted");
      }
      if (sentenceRegionBeginPattern.matcher(word).matches()) {
        insideRegion = true;
        if (DEBUG) {
          log.info("  entering region");
        }
      }
      lastTokenWasNewline = false;
      continue;
    }
    if (lastSentence != null && currentSentence.isEmpty() && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
      if (!discardToken) {
        lastSentence.add(o);
      }
      if (DEBUG) {
        log.info("Word is " + word + (discardToken ? "; discarded" : "; added to last sentence"));
      }
      lastTokenWasNewline = false;
      continue;
    }
    boolean newSent = false;
    String debugText = (discardToken) ? "discarded" : "added to current";
    if (inWaitForForcedEnd && !forcedEnd) {
      if (!discardToken) {
        currentSentence.add(o);
      }
      if (DEBUG) {
        log.info("Word is " + word + "; is in wait for forced end; " + debugText);
      }
    } else if (inMultiTokenExpr && !forcedEnd) {
      if (!discardToken) {
        currentSentence.add(o);
      }
      if (DEBUG) {
        log.info("Word is " + word + "; is in multi token expr; " + debugText);
      }
    } else if (sentenceBoundaryToDiscard.contains(word)) {
      if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
        newSent = true;
      } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
        if (lastTokenWasNewline) {
          newSent = true;
        }
      }
      lastTokenWasNewline = true;
      if (DEBUG) {
        log.info("Word is " + word + "; is discarded sentence boundary");
      }
    } else {
      lastTokenWasNewline = false;
      Boolean isb;
      if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; is XML break element; discarded");
        }
      } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
        insideRegion = false;
        newSent = true;
        // marked sentence boundaries
      } else if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary.get(o)) != null) && isb) {
        if (!discardToken) {
          currentSentence.add(o);
        }
        if (DEBUG) {
          log.info("Word is " + word + "; is sentence boundary (matched multi-token pattern); " + debugText);
        }
        newSent = true;
      } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
        if (!discardToken) {
          currentSentence.add(o);
        }
        if (DEBUG) {
          log.info("Word is " + word + "; is sentence boundary; " + debugText);
        }
        newSent = true;
      } else if (forcedEnd) {
        if (!discardToken) {
          currentSentence.add(o);
        }
        inWaitForForcedEnd = false;
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
        }
      } else {
        if (!discardToken) {
          currentSentence.add(o);
        }
        if (DEBUG) {
          log.info("Word is " + word + "; " + debugText);
        }
      }
    }
    if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
      if (DEBUG) {
        log.info("  beginning new sentence");
      }
      // add this sentence now that it's complete
      sentences.add(currentSentence);
      lastSentence = currentSentence;
      // clear the current sentence
      currentSentence = new ArrayList<>();
    }
  }
  // add any words at the end of the document that lack a sentence terminator
  if (!currentSentence.isEmpty()) {
    // add the last sentence
    sentences.add(currentSentence);
  }
  return sentences;
}
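
A minimal sketch (not from the source) of driving this splitter directly, under the assumption noted in the javadoc that tokens come from a tokenizer such as PTBTokenizer; process(), used in the first snippet above, delegates to wordsToSentences(). The demo class name is invented.

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;

// Hypothetical demo class: tokenize, then split tokens into sentences.
public class SentenceSplitDemo {
  public static void main(String[] args) {
    String text = "Hello world. This is a second sentence!";
    // PTBTokenizer makes sentence-final punctuation its own token,
    // which is what wordsToSentences() requires
    PTBTokenizer<CoreLabel> tokenizer =
        new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizer.tokenize();
    // default processor: break on standard sentence boundary tokens
    WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>();
    List<List<CoreLabel>> sentences = wts.process(tokens);
    System.out.println(sentences.size() + " sentences");
  }
}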
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class TrueCaseAnnotator, method annotate().
@Override
public void annotate(Annotation annotation) {
  if (verbose) {
    log.info("Adding true-case annotation...");
  }
  if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
    // classify tokens for each sentence
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      List<CoreLabel> output = this.trueCaser.classifySentence(tokens);
      for (int i = 0, size = tokens.size(); i < size; i++) {
        // add the truecaser tag to each token
        String neTag = output.get(i).get(CoreAnnotations.AnswerAnnotation.class);
        tokens.get(i).set(CoreAnnotations.TrueCaseAnnotation.class, neTag);
        setTrueCaseText(tokens.get(i));
      }
    }
  } else {
    throw new RuntimeException("unable to find sentences in: " + annotation);
  }
}
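
A usage sketch (invented demo class, standard properties): the truecase annotator needs part-of-speech and lemma annotations upstream, and the loop below reads back the TrueCaseTextAnnotation that setTrueCaseText() fills in.

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

// Hypothetical demo class showing the truecase stage end to end.
public class TrueCaseDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, truecase");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation("lukas likes new york in the fall.");
    pipeline.annotate(document);
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        // TrueCaseTextAnnotation was set by setTrueCaseText() above
        System.out.println(token.word() + " -> " + token.get(CoreAnnotations.TrueCaseTextAnnotation.class));
      }
    }
  }
}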
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class JSONOutputter, method print().
/** {@inheritDoc} */
// It's lying; we need the "redundant" casts (as of 2014-09-08)
@SuppressWarnings("RedundantCast")
@Override
public void print(Annotation doc, OutputStream target, Options options) throws IOException {
  PrintWriter writer = new PrintWriter(IOUtils.encodedOutputStreamWriter(target, options.encoding));
  JSONWriter l0 = new JSONWriter(writer, options);
  l0.object(l1 -> {
    l1.set("docId", doc.get(CoreAnnotations.DocIDAnnotation.class));
    l1.set("docDate", doc.get(CoreAnnotations.DocDateAnnotation.class));
    l1.set("docSourceType", doc.get(CoreAnnotations.DocSourceTypeAnnotation.class));
    l1.set("docType", doc.get(CoreAnnotations.DocTypeAnnotation.class));
    l1.set("author", doc.get(CoreAnnotations.AuthorAnnotation.class));
    l1.set("location", doc.get(CoreAnnotations.LocationAnnotation.class));
    if (options.includeText) {
      l1.set("text", doc.get(CoreAnnotations.TextAnnotation.class));
    }
    if (doc.get(CoreAnnotations.SentencesAnnotation.class) != null) {
      l1.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> (Consumer<Writer>) (Writer l2) -> {
        l2.set("id", sentence.get(CoreAnnotations.SentenceIDAnnotation.class));
        l2.set("index", sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
        l2.set("line", sentence.get(CoreAnnotations.LineNumberAnnotation.class));
        StringWriter treeStrWriter = new StringWriter();
        TreePrint treePrinter = options.constituentTreePrinter;
        if (treePrinter == AnnotationOutputter.DEFAULT_CONSTITUENT_TREE_PRINTER) {
          treePrinter = new TreePrint("oneline");
        }
        treePrinter.printTree(sentence.get(TreeCoreAnnotations.TreeAnnotation.class), new PrintWriter(treeStrWriter, true));
        String treeStr = treeStrWriter.toString().trim();
        if (!"SENTENCE_SKIPPED_OR_UNPARSABLE".equals(treeStr)) {
          l2.set("parse", treeStr);
        }
        l2.set("basicDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)));
        l2.set("enhancedDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)));
        l2.set("enhancedPlusPlusDependencies", buildDependencyTree(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
        Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
        if (sentimentTree != null) {
          int sentiment = RNNCoreAnnotations.getPredictedClass(sentimentTree);
          String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
          l2.set("sentimentValue", Integer.toString(sentiment));
          l2.set("sentiment", sentimentClass.replaceAll(" ", ""));
        }
        Collection<RelationTriple> openIETriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
        if (openIETriples != null) {
          l2.set("openie", openIETriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> {
            tripleWriter.set("subject", triple.subjectGloss());
            tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan()));
            tripleWriter.set("relation", triple.relationGloss());
            tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan()));
            tripleWriter.set("object", triple.objectGloss());
            tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan()));
          }));
        }
        Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
        if (kbpTriples != null) {
          l2.set("kbp", kbpTriples.stream().map(triple -> (Consumer<Writer>) (Writer tripleWriter) -> {
            tripleWriter.set("subject", triple.subjectGloss());
            tripleWriter.set("subjectSpan", Span.fromPair(triple.subjectTokenSpan()));
            tripleWriter.set("relation", triple.relationGloss());
            tripleWriter.set("relationSpan", Span.fromPair(triple.relationTokenSpan()));
            tripleWriter.set("object", triple.objectGloss());
            tripleWriter.set("objectSpan", Span.fromPair(triple.objectTokenSpan()));
          }));
        }
        if (sentence.get(CoreAnnotations.MentionsAnnotation.class) != null) {
          Integer sentTokenBegin = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
          l2.set("entitymentions", sentence.get(CoreAnnotations.MentionsAnnotation.class).stream().map(m -> (Consumer<Writer>) (Writer l3) -> {
            Integer tokenBegin = m.get(CoreAnnotations.TokenBeginAnnotation.class);
            Integer tokenEnd = m.get(CoreAnnotations.TokenEndAnnotation.class);
            l3.set("docTokenBegin", tokenBegin);
            l3.set("docTokenEnd", tokenEnd);
            if (tokenBegin != null && sentTokenBegin != null) {
              l3.set("tokenBegin", tokenBegin - sentTokenBegin);
            }
            if (tokenEnd != null && sentTokenBegin != null) {
              l3.set("tokenEnd", tokenEnd - sentTokenBegin);
            }
            l3.set("text", m.get(CoreAnnotations.TextAnnotation.class));
            l3.set("characterOffsetBegin", m.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
            l3.set("characterOffsetEnd", m.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            l3.set("ner", m.get(CoreAnnotations.NamedEntityTagAnnotation.class));
            l3.set("normalizedNER", m.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
            l3.set("entitylink", m.get(CoreAnnotations.WikipediaEntityAnnotation.class));
            Timex time = m.get(TimeAnnotations.TimexAnnotation.class);
            if (time != null) {
              Timex.Range range = time.range();
              l3.set("timex", (Consumer<Writer>) l4 -> {
                l4.set("tid", time.tid());
                l4.set("type", time.timexType());
                l4.set("value", time.value());
                l4.set("altValue", time.altVal());
                l4.set("range", (range != null) ? (Consumer<Writer>) l5 -> {
                  l5.set("begin", range.begin);
                  l5.set("end", range.end);
                  l5.set("duration", range.duration);
                } : null);
              });
            }
          }));
        }
        if (sentence.get(CoreAnnotations.TokensAnnotation.class) != null) {
          l2.set("tokens", sentence.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> (Consumer<Writer>) (Writer l3) -> {
            l3.set("index", token.index());
            l3.set("word", token.word());
            l3.set("originalText", token.originalText());
            l3.set("lemma", token.lemma());
            l3.set("characterOffsetBegin", token.beginPosition());
            l3.set("characterOffsetEnd", token.endPosition());
            l3.set("pos", token.tag());
            l3.set("ner", token.ner());
            l3.set("normalizedNER", token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
            l3.set("speaker", token.get(CoreAnnotations.SpeakerAnnotation.class));
            l3.set("truecase", token.get(CoreAnnotations.TrueCaseAnnotation.class));
            l3.set("truecaseText", token.get(CoreAnnotations.TrueCaseTextAnnotation.class));
            l3.set("before", token.get(CoreAnnotations.BeforeAnnotation.class));
            l3.set("after", token.get(CoreAnnotations.AfterAnnotation.class));
            l3.set("entitylink", token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
            Timex time = token.get(TimeAnnotations.TimexAnnotation.class);
            if (time != null) {
              Timex.Range range = time.range();
              l3.set("timex", (Consumer<Writer>) l4 -> {
                l4.set("tid", time.tid());
                l4.set("type", time.timexType());
                l4.set("value", time.value());
                l4.set("altValue", time.altVal());
                l4.set("range", (range != null) ? (Consumer<Writer>) l5 -> {
                  l5.set("begin", range.begin);
                  l5.set("end", range.end);
                  l5.set("duration", range.duration);
                } : null);
              });
            }
          }));
        }
      }));
    }
    if (doc.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
      Map<Integer, CorefChain> corefChains = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
      if (corefChains != null) {
        l1.set("corefs", (Consumer<Writer>) chainWriter -> {
          for (CorefChain chain : corefChains.values()) {
            CorefChain.CorefMention representative = chain.getRepresentativeMention();
            chainWriter.set(Integer.toString(chain.getChainID()), chain.getMentionsInTextualOrder().stream().map(mention -> (Consumer<Writer>) (Writer mentionWriter) -> {
              mentionWriter.set("id", mention.mentionID);
              mentionWriter.set("text", mention.mentionSpan);
              mentionWriter.set("type", mention.mentionType);
              mentionWriter.set("number", mention.number);
              mentionWriter.set("gender", mention.gender);
              mentionWriter.set("animacy", mention.animacy);
              mentionWriter.set("startIndex", mention.startIndex);
              mentionWriter.set("endIndex", mention.endIndex);
              mentionWriter.set("headIndex", mention.headIndex);
              mentionWriter.set("sentNum", mention.sentNum);
              mentionWriter.set("position", Arrays.stream(mention.position.elems()).boxed().collect(Collectors.toList()));
              mentionWriter.set("isRepresentativeMention", mention == representative);
            }));
          }
        });
      }
    }
    if (doc.get(CoreAnnotations.QuotationsAnnotation.class) != null) {
      List<CoreMap> quotes = QuoteAnnotator.gatherQuotes(doc);
      l1.set("quotes", quotes.stream().map(quote -> (Consumer<Writer>) (Writer l2) -> {
        l2.set("id", quote.get(CoreAnnotations.QuotationIndexAnnotation.class));
        l2.set("text", quote.get(CoreAnnotations.TextAnnotation.class));
        l2.set("beginIndex", quote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
        l2.set("endIndex", quote.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
        l2.set("beginToken", quote.get(CoreAnnotations.TokenBeginAnnotation.class));
        l2.set("endToken", quote.get(CoreAnnotations.TokenEndAnnotation.class));
        l2.set("beginSentence", quote.get(CoreAnnotations.SentenceBeginAnnotation.class));
        l2.set("endSentence", quote.get(CoreAnnotations.SentenceEndAnnotation.class));
      }));
    }
  });
  // flush the underlying writer
  l0.writer.flush();
}
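
Finally, a sketch of invoking this outputter. The demo class is invented, and the static JSONOutputter.jsonPrint helper is an assumption here (mirroring XMLOutputter's xmlPrint convenience method), so check it against your CoreNLP version.

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.JSONOutputter;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

// Hypothetical demo class; jsonPrint drives the print() method shown above.
public class JsonOutputDemo {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation("Stanford is in California.");
    pipeline.annotate(document);
    // serialize the annotation as JSON to stdout
    JSONOutputter.jsonPrint(document, System.out, pipeline);
  }
}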