Usage example of edu.stanford.nlp.util.CoreMap from the CoreNLP project by stanfordnlp: class TokenSequenceMatcherITest, method testTokenSequenceMatcherPosNNP.
/**
 * Verifies matching over NNP-tagged tokens: bare attribute patterns,
 * environment-bound pattern variables ($NNP), capturing groups, the
 * "{}"-free attribute syntax, and the "pos" alias for "tag".
 */
public void testTokenSequenceMatcherPosNNP() throws IOException {
  CoreMap doc = createDocument(testText1);

  // One or more consecutive NNP tokens; first hit is a single proper noun.
  TokenSequencePattern pattern = TokenSequencePattern.compile("[ { tag:\"NNP\" } ]+");
  TokenSequenceMatcher matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  boolean found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("Mellitus", matcher.group());

  // NNP, then is/was, then a lazy wildcard gap, then the next NNP run.
  pattern = TokenSequencePattern.compile("[ { tag:\"NNP\" } ] [ /is|was/ ] []*? [ { tag:\"NNP\" } ]+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop", matcher.group());

  // Bind $NNP in an environment and reuse it inside a larger pattern.
  TokenSequencePattern nnpPattern = TokenSequencePattern.compile("[ { tag:\"NNP\" } ]");
  Env env = TokenSequencePattern.getNewEnv();
  env.bind("$NNP", nnpPattern);
  pattern = TokenSequencePattern.compile(env, " $NNP [ /is|was/ ] []*? $NNP+ [ \"of\" ] $NNP+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());

  // Explicit capturing groups wrapped around each $NNP occurrence.
  pattern = TokenSequencePattern.compile(env, " ($NNP) /is|was/ []*? ($NNP)+ \"of\" ($NNP)+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(3, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());
  assertEquals("Mellitus", matcher.group(1));
  assertEquals("Bishop", matcher.group(2));
  assertEquals("London", matcher.group(3));

  // Capturing group baked into the bound variable itself.
  nnpPattern = TokenSequencePattern.compile(" ( [ { tag:\"NNP\" } ] )");
  env.bind("$NNP", nnpPattern);
  pattern = TokenSequencePattern.compile(env, " $NNP /is|was/ []*? $NNP+ \"of\" $NNP+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(3, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());
  assertEquals("Mellitus", matcher.group(1));
  assertEquals("Bishop", matcher.group(2));
  assertEquals("London", matcher.group(3));

  // Same as above but without the extra "{}" around the attribute.
  nnpPattern = TokenSequencePattern.compile(" ( [ tag:\"NNP\" ] )");
  env.bind("$NNP", nnpPattern);
  pattern = TokenSequencePattern.compile(env, " $NNP /is|was/ []*? $NNP+ \"of\" $NNP+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(3, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());
  assertEquals("Mellitus", matcher.group(1));
  assertEquals("Bishop", matcher.group(2));
  assertEquals("London", matcher.group(3));

  // Same as above but addressing the tag via the "pos" attribute alias.
  nnpPattern = TokenSequencePattern.compile(" ( [ pos:\"NNP\" ] )");
  env.bind("$NNP", nnpPattern);
  pattern = TokenSequencePattern.compile(env, " $NNP /is|was/ []*? $NNP+ \"of\" $NNP+ ");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(3, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());
  assertEquals("Mellitus", matcher.group(1));
  assertEquals("Bishop", matcher.group(2));
  assertEquals("London", matcher.group(3));
}
Usage example of edu.stanford.nlp.util.CoreMap from the CoreNLP project by stanfordnlp: class TokenSequenceMatcherITest, method testTokenSequenceMatcher8.
/**
 * Verifies greedy quantifiers over alphabetic tokens: "*" matches whole
 * phrases and can back off to length zero, while "+" requires at least
 * one preceding token and therefore cannot match at sentence start.
 */
public void testTokenSequenceMatcher8() throws IOException {
  CoreMap doc = createDocument(testText1);

  // Zero-or-more alphabetic tokens: each find() grabs a maximal word run.
  TokenSequencePattern pattern = TokenSequencePattern.compile("[ /[A-Za-z]+/ ]*");
  TokenSequenceMatcher matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  boolean found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("Mellitus was the first Bishop of London", matcher.group());
  found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("the third Archbishop of Canterbury", matcher.group());

  // "*" can match zero tokens, so "Mellitus was" is still found once.
  pattern = TokenSequencePattern.compile("[ /[A-Za-z]+/ ]* [\"Mellitus\"] [ \"was\"]");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertTrue(found);
  assertEquals(0, matcher.groupCount());
  assertEquals("Mellitus was", matcher.group());
  found = matcher.find();
  assertFalse(found);

  // "+" needs at least one token before "Mellitus", so nothing matches.
  pattern = TokenSequencePattern.compile("[ /[A-Za-z]+/ ]+ [\"Mellitus\"] [ \"was\"]");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  found = matcher.find();
  assertFalse(found);
}
Usage example of edu.stanford.nlp.util.CoreMap from the CoreNLP project by stanfordnlp: class TokenSequenceMatcherITest, method testTokenSequenceMatchesWildcard.
/**
 * Verifies that matches() succeeds on a two-token document for several
 * equivalent wildcard quantifications: an alternation in either order
 * and an explicit {1,2} range.
 */
public void testTokenSequenceMatchesWildcard() throws IOException {
  CoreMap doc = createDocument("word1 word2");

  // Two wildcards or one: the two-token branch covers the full input.
  TokenSequencePattern pattern = TokenSequencePattern.compile("[]{2}|[]");
  TokenSequenceMatcher matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  boolean matched = matcher.matches();
  assertTrue(matched);

  // Same alternation with branches reversed — result must not change.
  pattern = TokenSequencePattern.compile("[]|[]{2}");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  matched = matcher.matches();
  assertTrue(matched);

  // Range quantifier {1,2} accepts the two-token document as well.
  pattern = TokenSequencePattern.compile("[]{1,2}");
  matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
  matched = matcher.matches();
  assertTrue(matched);
}
Usage example of edu.stanford.nlp.util.CoreMap from the CoreNLP project by stanfordnlp: class TokenSequenceMatcherITest, method testTokenSequenceMatcherValue.
/**
 * Verifies that values attached to pattern alternatives are surfaced by
 * groupValue(): each matched word yields its bound value (or null when
 * none was associated), in document order.
 */
public void testTokenSequenceMatcherValue() throws IOException {
  CoreMap doc = createDocument(testText);

  // Pattern built from (word, value) pairs; null means "no value bound".
  TokenSequencePattern pattern = TokenSequencePattern.compile(getOrPatternExpr(new Pair<String, Object>("one", 1), new Pair<String, Object>("two", null), new Pair<String, Object>("fifty", 50)));
  TokenSequenceMatcher matcher = pattern.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));

  boolean found = matcher.find();
  assertTrue(found);
  assertEquals("one", matcher.group());
  assertEquals(1, matcher.groupValue());

  found = matcher.find();
  assertTrue(found);
  assertEquals("two", matcher.group());
  assertNull(matcher.groupValue());

  found = matcher.find();
  assertTrue(found);
  assertEquals("fifty", matcher.group());
  assertEquals(50, matcher.groupValue());

  // No further alternatives occur in the document.
  found = matcher.find();
  assertFalse(found);
}
Usage example of edu.stanford.nlp.util.CoreMap from the CoreNLP project by stanfordnlp: class WordsToSentencesAnnotator, method annotate.
/**
 * Splits the document's token list into sentences and stores them under
 * {@code SentencesAnnotation}. Each sentence is an {@link Annotation}
 * carrying its text, character/token offsets, sentence index, and (when
 * applicable) line number, doc ID, and section annotations.
 *
 * <p>If setCountLineNumbers is set to true, we count line numbers by
 * telling the underlying splitter to return empty lists of tokens
 * and then treating those empty lists as empty lines. We don't
 * actually include empty sentences in the annotation, though.
 *
 * @param annotation document to annotate; must already contain a
 *                   {@code TokensAnnotation}
 * @throws IllegalArgumentException if the annotation has no tokens
 **/
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
log.info("Sentence splitting ...");
}
// Tokenization is a hard precondition — fail loudly rather than produce no sentences.
if (!annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
throw new IllegalArgumentException("WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
}
// get text and tokens from the document
String text = annotation.get(CoreAnnotations.TextAnnotation.class);
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
// log.info("Tokens are: " + tokens);
// assemble the sentence annotations
int tokenOffset = 0;
int lineNumber = 0;
// section annotations to mark sentences with
CoreMap sectionAnnotations = null;
List<CoreMap> sentences = new ArrayList<>();
// NOTE(review): wts is presumably the underlying word-to-sentence splitter
// configured elsewhere in this class — confirm against the field declaration.
for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
// In line-counting mode every splitter result (including empties) is one line.
if (countLineNumbers) {
++lineNumber;
}
if (sentenceTokens.isEmpty()) {
// Empty sentences are only legal as blank-line markers in counting mode.
if (!countLineNumbers) {
throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
} else {
continue;
}
}
// get the sentence text from the first and last character offsets
int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int last = sentenceTokens.size() - 1;
int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
String sentenceText = text.substring(begin, end);
// create a sentence annotation with text and token offsets
Annotation sentence = new Annotation(sentenceText);
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
// tokenOffset is advanced before TokenEnd so the end index is exclusive.
tokenOffset += sentenceTokens.size();
sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
if (countLineNumbers) {
sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
}
// Annotate sentence with section information.
// Assume section start and end appear as first and last tokens of sentence
CoreLabel sentenceStartToken = sentenceTokens.get(0);
CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);
CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
if (sectionStart != null) {
// Section is started
sectionAnnotations = sectionStart;
}
// sectionAnnotations persists across loop iterations so every sentence
// inside an open section inherits its annotations.
if (sectionAnnotations != null) {
// transfer annotations over to sentence
ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
}
String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
if (sectionEnd != null) {
// Close the section after this sentence has been annotated with it.
sectionAnnotations = null;
}
if (docID != null) {
sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
}
// Token indices are 1-based within each sentence; sentence index is 0-based.
int index = 1;
for (CoreLabel token : sentenceTokens) {
token.setIndex(index++);
token.setSentIndex(sentences.size());
if (docID != null) {
token.setDocID(docID);
}
}
// add the sentence to the list
sentences.add(sentence);
}
// the condition below is possible if sentenceBoundaryToDiscard is initialized!
/*
if (tokenOffset != tokens.size()) {
throw new RuntimeException(String.format(
"expected %d tokens, found %d", tokens.size(), tokenOffset));
}
*/
// add the sentences annotations to the document
annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}
End of aggregated CoreMap usage examples.