use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class StatefullTokenizerTest method testCharacterOffsetToTokenIndex.
/**
 * Tests whether the mapping between character offset and token index is correct.
 *
 * <p>A TextAnnotation is built from a plain text with a tokenizer, then a second
 * TextAnnotation is built by hand over the same tokens embedded in a longer text
 * (markup-like "waste" before and after), with every token's character offsets
 * shifted by the length of the leading waste. Token start offsets of the second
 * annotation must equal those of the first plus that shift, and character offsets
 * that fall outside any token must map to token id -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = " \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    int ignoreUpToOffset = leadingWaste.length();
    // Size the offset array from the actual token count rather than a hard-coded
    // literal, so the array always matches the loop that fills it.
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(),
                ignoreUpToOffset + t.getEndCharOffset());
    }

    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens,
            sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // assertEquals takes (expected, actual): the expected value is the offset in the
    // plain text shifted by the leading waste, the actual value comes from taOther.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + ignoreUpToOffset, thirdTokOtherStart);

    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + ignoreUpToOffset, eighthTokOtherStart);

    // an offset inside the leading waste belongs to no token
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);

    // an offset inside the trailing waste belongs to no token
    int meaninglessPastEndOffset =
            taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);

    // offset 20 is the first newline between the two sentences of the plain text
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class TransliterationAnnotatorTest method testTransliterationWorks.
/**
 * Runs the transliteration annotator on a generated TextAnnotation and checks that
 * the TRANSLITERATION view exists and that at least one of its constituents has a
 * label containing the Persian transliteration of "John".
 */
@Test
public void testTransliterationWorks() throws AnnotatorException {
    final TextAnnotation annotated =
            DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(false, 3);
    annotator.getView(annotated);
    assertEquals(true, annotated.hasView(ViewNames.TRANSLITERATION));

    final List<Constituent> transliterations =
            annotated.getView(ViewNames.TRANSLITERATION).getConstituents();

    boolean foundJohn = false;
    for (Constituent candidate : transliterations) {
        // Persian transliteration of "John"; every label is inspected, the flag only ever turns on
        foundJohn |= candidate.getLabel().contains("جان");
    }
    assertTrue(foundJohn);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class PredicateDetector method getLemma.
/**
 * Decides whether the token at {@code tokenId} is a predicate and, if so, returns its lemma.
 *
 * <p>Verb tokens (by POS tag, excluding the tag {@code AUX}) are predicates unless one of the
 * exclusion rules below applies: modals and contracted auxiliaries, "be" inside a chunk that
 * continues past the token, "have + be", "have/do + verb", "according to", and
 * "do/have + n't/not + verb". Non-verb tokens starting with "re-" are predicates when the
 * remainder of the token after that prefix is found in the lemma dictionary.
 *
 * @param ta the text annotation holding the token
 * @param tokenId index of the token to inspect
 * @return an Option wrapping the (possibly normalized) lemma when the token is a predicate,
 *         otherwise an empty Option
 */
public Option<String> getLemma(TextAnnotation ta, int tokenId) {
    String pos = WordHelpers.getPOS(ta, tokenId);
    String token = ta.getToken(tokenId).toLowerCase();
    String lemma = WordHelpers.getLemma(ta, tokenId);
    boolean predicate = false;

    // any token that is a verb is a predicate
    if (POSUtils.isPOSVerb(pos) && !pos.equals("AUX")) {
        // normalize contracted forms: 's / 're / 'm -> "be"; 'd / wo(n't) / 'll -> modal marker
        if (token.equals("'s") || token.equals("'re") || token.equals("'m"))
            lemma = "be";
        else if (token.equals("'d") || lemma.equals("wo") || lemma.equals("'ll"))
            lemma = "xmodal";

        // modals and the contracted "'ve" are never predicates
        predicate = !(lemma.equals("xmodal") || pos.equals("MD") || token.equals("'ve"));

        // ignore all instances of has + "to be" if they are followed by a
        // verb or if the token is "be" followed by a verb
        boolean doVerb = lemma.equals("do");
        boolean be = lemma.equals("be");
        boolean have = lemma.equals("have");

        // rules that look at the next token
        if (tokenId < ta.size() - 1) {
            if (be) {
                SpanLabelView chunk = (SpanLabelView) ta.getView(ViewNames.SHALLOW_PARSE);
                for (Constituent c : chunk.getConstituentsCoveringToken(tokenId)) {
                    // if the covering chunk does not end at this token,
                    // then there is another verb here
                    if (c.getEndSpan() - 1 != tokenId) {
                        predicate = false;
                        break;
                    }
                }
            }

            // ignore "have + be"
            if (have && WordHelpers.getLemma(ta, tokenId + 1).equals("be")) {
                predicate = false;
            }

            // ignore "have/do + verb"
            if ((have || doVerb) && POSUtils.isPOSVerb(WordHelpers.getPOS(ta, tokenId + 1)))
                predicate = false;

            // ignore "according to"
            if (token.equals("according") && ta.getToken(tokenId + 1).toLowerCase().equals("to"))
                predicate = false;
        }

        // rules that look two tokens ahead
        if (tokenId < ta.size() - 2) {
            // ignore don't + V or haven't + V
            if (doVerb || have) {
                String nextToken = ta.getToken(tokenId + 1).toLowerCase();
                if ((nextToken.equals("n't") || nextToken.equals("not"))
                        && POSUtils.isPOSVerb(WordHelpers.getPOS(ta, tokenId + 2)))
                    predicate = false;
            }
        }
    } else if (token.startsWith("re-")) {
        // Strip only the leading "re-" prefix. String.replace would also delete later
        // occurrences of "re-" inside the token (e.g. in "re-pre-order") and corrupt
        // the word that is looked up in the dictionary.
        String trim = token.substring("re-".length());
        predicate = WordNetPlusLemmaViewGenerator.lemmaDict.contains(trim);
    }

    if (predicate) {
        return new Option<>(lemma);
    } else {
        return Option.empty();
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class PredicateDetector method getPredicates.
/**
 * Collects a single-token constituent for every token of {@code ta} for which
 * {@code getLemma} yields a value. Each constituent has an empty label and an empty
 * view name, spans exactly that token, and carries the lemma under the attribute key
 * {@code PredicateArgumentView.LemmaIdentifier}.
 *
 * @param ta the text annotation whose tokens are scanned in order
 * @return the predicate constituents, in token order; empty if none were found
 */
public List<Constituent> getPredicates(TextAnnotation ta) throws Exception {
    final List<Constituent> predicates = new ArrayList<>();
    for (int tokenId = 0; tokenId < ta.size(); tokenId++) {
        final Option<String> lemma = getLemma(ta, tokenId);
        if (!lemma.isPresent()) {
            // not a predicate: nothing to record for this token
            continue;
        }
        final Constituent predicate = new Constituent("", "", ta, tokenId, tokenId + 1);
        predicate.addAttribute(PredicateArgumentView.LemmaIdentifier, lemma.get());
        predicates.add(predicate);
    }
    return predicates;
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent in project cogcomp-nlp by CogComp.
the class EREDocumentReader method createAndAddXmlMarkupAnnotations.
/**
 * Builds a view whose constituents mark post and quote spans, and attaches it to the
 * TextAnnotation wrapped by {@code xmlTa}.
 *
 * <p>Only markup spans labeled POST or QUOTE are considered. For each, the span's offsets in
 * the original xml are mapped to offsets in the cleaned text and then to token ids. A
 * constituent labeled with the span type is added to the view only when author information
 * is present on the span (attribute AUTHOR for posts, ORIG_AUTHOR for quotes); it then carries
 * the attributes AUTHOR (the author name) and NAME_START / NAME_END (the name's offsets as
 * recorded in the markup). The view is added to the TextAnnotation only if it ends up
 * non-empty.
 *
 * @param xmlTa an XmlTextAnnotation containing information to use for a POST_ERE view.
 * @throws IllegalStateException if either span offset cannot be mapped to the cleaned text
 */
private void createAndAddXmlMarkupAnnotations(XmlTextAnnotation xmlTa) {
    List<XmlDocumentProcessor.SpanInfo> markup = xmlTa.getXmlMarkup();
    TextAnnotation ta = xmlTa.getTextAnnotation();
    View postView = new View(getPostViewName(), NAME, ta, 1.0);

    for (XmlDocumentProcessor.SpanInfo spanInfo : markup) {
        final String spanType = spanInfo.label;

        // pick the attribute holding the author, or skip spans that are neither post nor quote
        final Pair<String, IntPair> author;
        if (POST.equals(spanType)) {
            author = spanInfo.attributes.get(AUTHOR);
        } else if (QUOTE.equals(spanType)) {
            author = spanInfo.attributes.get(ORIG_AUTHOR);
        } else {
            continue;
        }

        // map original-xml offsets to cleaned-text offsets; -1 signals a failed mapping
        final int cleanStart =
                xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(spanInfo.spanOffsets.getFirst());
        final int cleanEnd =
                xmlTa.getXmlSt().computeModifiedOffsetFromOriginal(spanInfo.spanOffsets.getSecond());
        if (cleanStart == -1 || cleanEnd == -1) {
            throw new IllegalStateException("could not compute cleanText offsets for " + spanType
                    + " span with offsets " + spanInfo.spanOffsets.getFirst() + ", "
                    + spanInfo.spanOffsets.getSecond());
        }

        final int startTok = ta.getTokenIdFromCharacterOffset(cleanStart);
        final int endTok = ta.getTokenIdFromCharacterOffset(cleanEnd);
        assert startTok >= 0 && endTok >= 0 && endTok > startTok;

        final Constituent span = new Constituent(spanType, getPostViewName(), ta, startTok, endTok);
        if (author == null) {
            // spans without author information are not recorded in the view
            continue;
        }
        final IntPair nameOffsets = author.getSecond();
        span.addAttribute(AUTHOR, author.getFirst());
        span.addAttribute(NAME_START, Integer.toString(nameOffsets.getFirst()));
        span.addAttribute(NAME_END, Integer.toString(nameOffsets.getSecond()));
        postView.addConstituent(span);
    }

    if (!postView.getConstituents().isEmpty()) {
        ta.addView(getPostViewName(), postView);
    }
}
Aggregations