Use of edu.stanford.nlp.ling.MultiTokenTag in project CoreNLP by stanfordnlp: the WordToSentenceProcessor class, method wordsToSentences.
/**
* Returns a List of Lists where each element is built from a run
* of Words in the input Document. Specifically, reads through each word in
* the input document and breaks off a sentence after finding a valid
* sentence boundary token or end of file.
* Note that for this to work, the words in the
* input document must have been tokenized with a tokenizer that makes
* sentence boundary tokens their own tokens (e.g., {@link PTBTokenizer}).
*
* @param words A list of already tokenized words (must implement HasWord or be a String).
* @return A list of sentences.
* @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak, SequencePattern, Set, boolean, boolean)
*/
public List<List<IN>> wordsToSentences(List<? extends IN> words) {
// is null unless used by sentenceBoundaryMultiTokenPattern
IdentityHashMap<Object, Boolean> isSentenceBoundary = null;
if (sentenceBoundaryMultiTokenPattern != null) {
// Do initial pass using tokensregex to identify multi token patterns that need to be matched
// and add the last token to our table of sentence boundary tokens
isSentenceBoundary = new IdentityHashMap<>();
SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
while (matcher.find()) {
List nodes = matcher.groupNodes();
if (nodes != null && !nodes.isEmpty()) {
isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
}
}
}
// Split tokens into sentences!!!
List<List<IN>> sentences = Generics.newArrayList();
List<IN> currentSentence = new ArrayList<>();
List<IN> lastSentence = null;
boolean insideRegion = false;
boolean inWaitForForcedEnd = false;
boolean lastTokenWasNewline = false;
for (IN o : words) {
String word = getString(o);
boolean forcedEnd = isForcedEndToken(o);
boolean inMultiTokenExpr = false;
boolean discardToken = false;
if (o instanceof CoreMap) {
// Hacky stuff to ensure sentence breaks do not happen in certain cases
CoreMap cm = (CoreMap) o;
Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
if (!forcedEnd) {
if (forcedUntilEndValue != null && forcedUntilEndValue)
inWaitForForcedEnd = true;
else {
MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
if (mt != null && !mt.isEnd()) {
// In the middle of a multi token mention, make sure sentence is not ended here
inMultiTokenExpr = true;
}
}
}
}
if (tokenPatternsToDiscard != null) {
discardToken = matchesTokenPatternsToDiscard(word);
}
if (sentenceRegionBeginPattern != null && !insideRegion) {
if (DEBUG) {
log.info("Word is " + word + "; outside region; deleted");
}
if (sentenceRegionBeginPattern.matcher(word).matches()) {
insideRegion = true;
if (DEBUG) {
log.info(" entering region");
}
}
lastTokenWasNewline = false;
continue;
}
if (lastSentence != null && currentSentence.isEmpty() && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
if (!discardToken) {
lastSentence.add(o);
}
if (DEBUG) {
log.info("Word is " + word + (discardToken ? "discarded" : " added to last sentence"));
}
lastTokenWasNewline = false;
continue;
}
boolean newSent = false;
String debugText = (discardToken) ? "discarded" : "added to current";
if (inWaitForForcedEnd && !forcedEnd) {
if (!discardToken)
currentSentence.add(o);
if (DEBUG) {
log.info("Word is " + word + "; is in wait for forced end; " + debugText);
}
} else if (inMultiTokenExpr && !forcedEnd) {
if (!discardToken)
currentSentence.add(o);
if (DEBUG) {
log.info("Word is " + word + "; is in multi token expr; " + debugText);
}
} else if (sentenceBoundaryToDiscard.contains(word)) {
if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
newSent = true;
} else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
if (lastTokenWasNewline) {
newSent = true;
}
}
lastTokenWasNewline = true;
if (DEBUG) {
log.info("Word is " + word + " discarded sentence boundary");
}
} else {
lastTokenWasNewline = false;
Boolean isb;
if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
newSent = true;
if (DEBUG) {
log.info("Word is " + word + "; is XML break element; discarded");
}
} else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
insideRegion = false;
newSent = true;
// Marked sentence boundaries
} else if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary.get(o)) != null) && isb) {
if (!discardToken)
currentSentence.add(o);
if (DEBUG) {
log.info("Word is " + word + "; is sentence boundary (matched multi-token pattern); " + debugText);
}
newSent = true;
} else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
if (!discardToken)
currentSentence.add(o);
if (DEBUG) {
log.info("Word is " + word + "; is sentence boundary; " + debugText);
}
newSent = true;
} else if (forcedEnd) {
if (!discardToken)
currentSentence.add(o);
inWaitForForcedEnd = false;
newSent = true;
if (DEBUG) {
log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
}
} else {
if (!discardToken)
currentSentence.add(o);
if (DEBUG) {
log.info("Word is " + word + "; " + debugText);
}
}
}
if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
if (DEBUG) {
log.info(" beginning new sentence");
}
sentences.add(currentSentence);
// adds this sentence now that it's complete
lastSentence = currentSentence;
// clears the current sentence
currentSentence = new ArrayList<>();
}
}
// terminator at the end of file
if (!currentSentence.isEmpty()) {
// adds last sentence
sentences.add(currentSentence);
}
return sentences;
}
Use of edu.stanford.nlp.ling.MultiTokenTag in project CoreNLP by stanfordnlp: the RuleBasedCorefMentionFinder class, method extractPremarkedEntityMentions.
/**
 * Scans a sentence for tokens carrying MultiTokenTag mention markers and builds
 * a Mention for every pre-marked start..end span, recording its span in
 * mentionSpanSet. An end marker with no preceding start marker is logged as a
 * warning and skipped.
 */
protected static void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph enhancedDeps = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  // 0-based token offset where the current marked mention began, or -1 if none is open.
  int mentionStart = -1;
  for (CoreLabel token : tokens) {
    MultiTokenTag marker = token.get(CoreAnnotations.MentionTokenAnnotation.class);
    if (marker == null) {
      // Token is not part of any pre-marked mention.
      continue;
    }
    if (marker.isStart()) {
      // IndexAnnotation is 1-based; convert to a 0-based offset.
      mentionStart = token.get(CoreAnnotations.IndexAnnotation.class) - 1;
    }
    if (!marker.isEnd()) {
      continue;
    }
    // End of a marked mention: the (exclusive) end offset is the 1-based index itself.
    int mentionEnd = token.get(CoreAnnotations.IndexAnnotation.class);
    if (mentionStart >= 0) {
      IntPair span = new IntPair(mentionStart, mentionEnd);
      int dummyMentionId = -1;
      Mention mention = new Mention(dummyMentionId, mentionStart, mentionEnd, enhancedDeps, new ArrayList<>(tokens.subList(mentionStart, mentionEnd)));
      mentions.add(mention);
      mentionSpanSet.add(span);
      mentionStart = -1;
    } else {
      SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: " + marker + " at tokenIndex=" + (token.get(CoreAnnotations.IndexAnnotation.class) - 1) + " for " + s.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}
Use of edu.stanford.nlp.ling.MultiTokenTag in project CoreNLP by stanfordnlp: the CleanXmlAnnotator class, method process.
/**
 * Strips XML tag tokens from a token stream while harvesting information from
 * the tags: speaker turns, utterance indices, sections, document date/id/type,
 * forced sentence breaks, and per-token annotations driven by tag attributes.
 * Non-tag tokens are unescaped and kept; the text of removed tags is folded
 * into the Before/After annotations of neighboring kept tokens so the original
 * text stays recoverable (invertible tokenization).
 *
 * @param annotation the document-level Annotation that receives doc metadata
 *                   (null-checked before doc-level fields are written)
 * @param tokens the raw token list, possibly containing XML tag tokens
 * @return the filtered list of non-tag tokens
 * @throws IllegalArgumentException on mismatched or unclosed tags, unless allowFlawedXml is set
 */
public List<CoreLabel> process(Annotation annotation, List<CoreLabel> tokens) {
// As we are processing, this stack keeps track of which tags we
// are currently inside
Stack<String> enclosingTags = new Stack<>();
// Snapshot of enclosingTags shared by consecutive tokens; invalidated
// (set to null) whenever the tag stack changes, rebuilt lazily.
List<String> currentTagSet = null;
// How many tags matching xmlTagMatcher we are currently nested inside
int matchDepth = 0;
// the filtered (kept) tokens, accumulated as we go
List<CoreLabel> newTokens = new ArrayList<>();
// we use this to store the before & after annotations if the
// tokens were tokenized for "invertible"
StringBuilder removedText = new StringBuilder();
// Doc-level annotation classes (from docAnnotationPatterns) that we still
// need to find values for; entries are removed once annotated.
Set<Class> toAnnotate = new HashSet<>(docAnnotationPatterns.keySet());
int utteranceIndex = 0;
boolean inUtterance = false;
boolean inSpeakerTag = false;
String currentSpeaker = null;
List<CoreLabel> speakerTokens = new ArrayList<>();
List<CoreLabel> docDateTokens = new ArrayList<>();
List<CoreLabel> docTypeTokens = new ArrayList<>();
List<CoreLabel> docIdTokens = new ArrayList<>();
// Local variables for additional per token annotations
CoreMap tokenAnnotations = (tokenAnnotationPatterns != null && !tokenAnnotationPatterns.isEmpty()) ? new ArrayCoreMap() : null;
Map<Class, Stack<Pair<String, String>>> savedTokenAnnotations = new ArrayMap<>();
// Local variables for annotating sections
XMLUtils.XMLTag sectionStartTag = null;
CoreLabel sectionStartToken = null;
CoreMap sectionAnnotations = null;
Map<Class, List<CoreLabel>> savedTokensForSection = new HashMap<>();
boolean markSingleSentence = false;
for (CoreLabel token : tokens) {
String word = token.word().trim();
XMLUtils.XMLTag tag = XMLUtils.parseTag(word);
// If it's not a tag, we do manipulations such as unescaping
if (tag == null) {
// TODO: put this into the lexer instead of here
token.setWord(XMLUtils.unescapeStringForXML(token.word()));
// TODO: was there another annotation that also represents the word?
// Keep the token if we are inside a matched tag, or if no tag filter is
// configured, or if the filter matches the empty string (match-everything).
if (matchDepth > 0 || xmlTagMatcher == null || xmlTagMatcher.matcher("").matches()) {
newTokens.add(token);
if (inUtterance) {
token.set(CoreAnnotations.UtteranceAnnotation.class, utteranceIndex);
if (currentSpeaker != null)
token.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
}
if (markSingleSentence) {
token.set(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class, true);
markSingleSentence = false;
}
if (tokenAnnotations != null) {
ChunkAnnotationUtils.copyUnsetAnnotations(tokenAnnotations, token);
}
}
// Fold the text we removed (tags) into the Before annotation of this token
// and the After annotation of the previous kept token.
if (removedText.length() > 0) {
boolean added = false;
String before = token.get(CoreAnnotations.BeforeAnnotation.class);
if (before != null) {
token.set(CoreAnnotations.BeforeAnnotation.class, removedText + before);
added = true;
}
if (added && newTokens.size() > 1) {
CoreLabel previous = newTokens.get(newTokens.size() - 2);
String after = previous.get(CoreAnnotations.AfterAnnotation.class);
if (after != null)
previous.set(CoreAnnotations.AfterAnnotation.class, after + removedText);
else
previous.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
}
removedText = new StringBuilder();
}
if (currentTagSet == null) {
// We wrap the list in an unmodifiable list because we reuse
// the same list object many times. We don't want to
// let someone modify one list and screw up all the others.
currentTagSet = Collections.unmodifiableList(new ArrayList<>(enclosingTags));
}
token.set(CoreAnnotations.XmlContextAnnotation.class, currentTagSet);
// is this token part of the doc date sequence?
if (dateTagMatcher != null && !currentTagSet.isEmpty() && dateTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docDateTokens.add(token);
}
// is this token part of the doc id sequence?
if (docIdTagMatcher != null && !currentTagSet.isEmpty() && docIdTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docIdTokens.add(token);
}
// is this token part of the doc type sequence?
if (docTypeTagMatcher != null && !currentTagSet.isEmpty() && docTypeTagMatcher.matcher(currentTagSet.get(currentTagSet.size() - 1)).matches()) {
docTypeTokens.add(token);
}
if (inSpeakerTag) {
speakerTokens.add(token);
}
if (sectionStartTag != null) {
// Collect tokens belonging to the current section, skipping tokens the
// sentence splitter is configured to discard.
boolean okay = true;
if (ssplitDiscardTokensMatcher != null) {
okay = !ssplitDiscardTokensMatcher.matcher(token.word()).matches();
}
if (okay) {
if (sectionStartToken == null) {
sectionStartToken = token;
}
// Add tokens to saved section tokens
for (List<CoreLabel> saved : savedTokensForSection.values()) {
saved.add(token);
}
}
}
continue;
}
// At this point, we know we have a tag
// we are removing a token and its associated text...
// keep track of that
String currentRemoval = token.get(CoreAnnotations.BeforeAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
currentRemoval = token.get(CoreAnnotations.OriginalTextAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
// For the very last token, also capture its After text so nothing is lost.
if (token == tokens.get(tokens.size() - 1)) {
currentRemoval = token.get(CoreAnnotations.AfterAnnotation.class);
if (currentRemoval != null)
removedText.append(currentRemoval);
}
// Check if we want to annotate anything using the tags's attributes
if (!toAnnotate.isEmpty() && tag.attributes != null) {
Set<Class> foundAnnotations = annotateWithTag(annotation, annotation, tag, docAnnotationPatterns, null, toAnnotate, null);
toAnnotate.removeAll(foundAnnotations);
}
// Check if the tag matches a section
if (sectionTagMatcher != null && sectionTagMatcher.matcher(tag.name).matches()) {
if (tag.isEndTag) {
// Closing a section: flush the accumulated section annotations onto the
// first token of the section, then reset section state.
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
if (sectionStartToken != null) {
sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations);
}
// Mark previous token as forcing sentence and section end
if (!newTokens.isEmpty()) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
previous.set(CoreAnnotations.SectionEndAnnotation.class, sectionStartTag.name);
}
savedTokensForSection.clear();
sectionStartTag = null;
sectionStartToken = null;
sectionAnnotations = null;
} else if (!tag.isSingleTag) {
// Prepare to mark first token with section information
sectionStartTag = tag;
sectionAnnotations = new ArrayCoreMap();
sectionAnnotations.set(CoreAnnotations.SectionAnnotation.class, sectionStartTag.name);
}
}
if (sectionStartTag != null) {
// store away annotations for section
annotateWithTag(annotation, sectionAnnotations, tag, sectionAnnotationPatterns, savedTokensForSection, null, null);
}
if (tokenAnnotations != null) {
annotateWithTag(annotation, tokenAnnotations, tag, tokenAnnotationPatterns, null, null, savedTokenAnnotations);
}
// If the tag is configured as sentence-ending, force the previous kept token
// to end the sentence.
if (sentenceEndingTagMatcher != null && sentenceEndingTagMatcher.matcher(tag.name).matches() && !newTokens.isEmpty()) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
if (utteranceTurnTagMatcher != null && utteranceTurnTagMatcher.matcher(tag.name).matches()) {
if (!newTokens.isEmpty()) {
// Utterance turn is also sentence ending
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
// An opening (non-single) turn tag starts a new utterance; a closing or
// single tag ends the current one.
inUtterance = !(tag.isEndTag || tag.isSingleTag);
if (inUtterance) {
utteranceIndex++;
}
if (!inUtterance) {
currentSpeaker = null;
}
}
if (speakerTagMatcher != null && speakerTagMatcher.matcher(tag.name).matches()) {
if (!newTokens.isEmpty()) {
// Speaker is not really part of sentence
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
inSpeakerTag = !(tag.isEndTag || tag.isSingleTag);
if (tag.isEndTag) {
// Closing speaker tag: record the speaker string and mark each speaker
// token as part of a multi-token "Speaker" mention.
currentSpeaker = tokensToString(annotation, speakerTokens);
MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
int i = 0;
for (CoreLabel t : speakerTokens) {
t.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
t.set(CoreAnnotations.MentionTokenAnnotation.class, new MultiTokenTag(mentionTag, i));
i++;
}
} else {
currentSpeaker = null;
}
speakerTokens.clear();
}
if (singleSentenceTagMatcher != null && singleSentenceTagMatcher.matcher(tag.name).matches()) {
if (tag.isEndTag) {
// Mark previous token as forcing sentence end
if (!newTokens.isEmpty()) {
CoreLabel previous = newTokens.get(newTokens.size() - 1);
previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
}
markSingleSentence = false;
} else if (!tag.isSingleTag) {
// Enforce rest of the tokens to be single token until ForceSentenceEnd is seen
markSingleSentence = true;
}
}
// With no tag filter there is no need to maintain the tag stack / match depth.
if (xmlTagMatcher == null)
continue;
// Single (self-closing) tags don't change the enclosing-tag stack.
if (tag.isSingleTag) {
continue;
}
// at this point, we can't reuse the "currentTagSet" vector
// any more, since the current tag set has changed
currentTagSet = null;
if (tag.isEndTag) {
// Pop until we find the matching open tag, tolerating (or rejecting)
// mismatched tags per allowFlawedXml.
while (true) {
if (enclosingTags.isEmpty()) {
throw new IllegalArgumentException("Got a close tag " + tag.name + " which does not match" + " any open tag");
}
String lastTag = enclosingTags.pop();
if (xmlTagMatcher.matcher(lastTag).matches()) {
--matchDepth;
}
if (lastTag.equals(tag.name))
break;
if (!allowFlawedXml)
throw new IllegalArgumentException("Mismatched tags... " + tag.name + " closed a " + lastTag + " tag.");
}
if (matchDepth < 0) {
// This should be impossible: it would mean we closed more matched tags
// than we ever opened.
throw new AssertionError("Programming error? We think there " + "have been more close tags than open tags");
}
} else {
// open tag, since all other cases are exhausted
enclosingTags.push(tag.name);
if (xmlTagMatcher.matcher(tag.name).matches())
matchDepth++;
}
}
if (!enclosingTags.isEmpty() && !allowFlawedXml) {
throw new IllegalArgumentException("Unclosed tags, starting with " + enclosingTags.pop());
}
// If the last token was a removed tag, attach its text as the After
// annotation of the last kept token.
if (!newTokens.isEmpty() && removedText.length() > 0) {
CoreLabel lastToken = newTokens.get(newTokens.size() - 1);
// is only non-null if we are invertible. Hopefully.
if (lastToken.get(CoreAnnotations.OriginalTextAnnotation.class) != null) {
lastToken.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
}
}
// Populate docid, docdate, doctype
if (annotation != null) {
if (!docIdTokens.isEmpty()) {
String str = tokensToString(annotation, docIdTokens).trim();
annotation.set(CoreAnnotations.DocIDAnnotation.class, str);
}
if (!docDateTokens.isEmpty()) {
String str = tokensToString(annotation, docDateTokens).trim();
annotation.set(CoreAnnotations.DocDateAnnotation.class, str);
}
if (!docTypeTokens.isEmpty()) {
String str = tokensToString(annotation, docTypeTokens).trim();
annotation.set(CoreAnnotations.DocTypeAnnotation.class, str);
}
}
return newTokens;
}
Use of edu.stanford.nlp.ling.MultiTokenTag in project CoreNLP by stanfordnlp: the CorefMentionFinder class, method extractPremarkedEntityMentions.
/**
 * Builds Mention objects for spans that were pre-marked with MultiTokenTag
 * annotations on the sentence's tokens, adding each span to mentionSpanSet.
 * Uses enhanced dependencies when present, falling back to the basic graph.
 * An end marker without a matching start marker is logged and skipped.
 */
protected static void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sentenceTokens = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basicDeps = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedDeps = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedDeps == null) {
    // No enhanced graph available; use the basic dependencies for both.
    enhancedDeps = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }
  int start = -1;  // 0-based offset of the open mention's first token, or -1
  for (CoreLabel tok : sentenceTokens) {
    MultiTokenTag marker = tok.get(CoreAnnotations.MentionTokenAnnotation.class);
    if (marker != null) {
      // Token participates in a pre-marked mention.
      if (marker.isStart()) {
        // Convert the 1-based IndexAnnotation to a 0-based offset.
        start = tok.get(CoreAnnotations.IndexAnnotation.class) - 1;
      }
      if (marker.isEnd()) {
        // The 1-based index doubles as the exclusive end offset.
        int end = tok.get(CoreAnnotations.IndexAnnotation.class);
        if (start < 0) {
          Redwood.log("Start of marked mention not found in sentence: " + marker + " at tokenIndex=" + (tok.get(CoreAnnotations.IndexAnnotation.class) - 1) + " for " + s.get(CoreAnnotations.TextAnnotation.class));
        } else {
          IntPair span = new IntPair(start, end);
          int dummyMentionId = -1;
          Mention m = new Mention(dummyMentionId, start, end, sentenceTokens, basicDeps, enhancedDeps, new ArrayList<>(sentenceTokens.subList(start, end)));
          mentions.add(m);
          mentionSpanSet.add(span);
          start = -1;
        }
      }
    }
  }
}
Aggregations