use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class XmlDocumentProcessorTest method testXmlDocumentProcessor.
/**
 * Runs {@code XmlDocumentProcessor.processXml} over {@code ORIG_TEXT} and checks that:
 * <ul>
 * <li>the original text is preserved and the transformed text equals {@code CLEAN_TEXT};</li>
 * <li>the requested attribute ({@code AUTHOR}) is retained on the span at {@code POST_OFFSETS},
 * with offsets that point back at the attribute value in the original text;</li>
 * <li>spans are reported at {@code DISTR_OFFSETS} and {@code IQ_OFFSETS}, and the latter maps to
 * an empty range in the cleaned text;</li>
 * <li>offsets in the cleaned text can be mapped back to the original text.</li>
 * </ul>
 */
@Test
public void testXmlDocumentProcessor() {
    /*
     * Sample markup (presumably the opening lines of ORIG_TEXT -- confirm against the fixture):
     *
     * <doc id="ENG_DF_001241_20150407_F0000007T">
     * <headline>
     * cuba
     * </headline>
     * <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">
     */

    // Tag name -> names of the attributes to keep for that tag.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add("author");
    postAttributes.add("id");
    postAttributes.add("datetime");
    tagsWithAtts.put("post", postAttributes);
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("id");
    tagsWithAtts.put("doc", docAttributes);

    Set<String> deletableSpanTags = new HashSet<>();
    deletableSpanTags.add("quote");
    deletableSpanTags.add("distraction");

    Set<String> tagsToIgnore = new HashSet<>();
    tagsToIgnore.add("img");
    tagsToIgnore.add("snip");

    boolean throwExceptionOnXmlTagMiss = true;
    XmlDocumentProcessor proc = new XmlDocumentProcessor(deletableSpanTags, tagsWithAtts, tagsToIgnore, throwExceptionOnXmlTagMiss);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> nt = proc.processXml(ORIG_TEXT);

    // check that we retained the right attributes, cleaned up the text, generated a sensible cleaned text, and can
    // recover the offsets of strings in the original text.
    StringTransformation st = nt.getFirst();
    List<XmlDocumentProcessor.SpanInfo> retainedTagInfo = nt.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> offsetToSpans = XmlDocumentProcessor.compileOffsetSpanMapping(retainedTagInfo);

    // The span at POST_OFFSETS must carry the author attribute: its value and its original-text offsets.
    assertTrue(offsetToSpans.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo spanInfo = offsetToSpans.get(POST_OFFSETS);
    assertTrue(spanInfo.attributes.containsKey(AUTHOR));
    assertEquals(NAME, spanInfo.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, spanInfo.attributes.get(AUTHOR).getSecond());
    String origAuthStr = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, origAuthStr);

    // The span at DISTR_OFFSETS is labelled "distraction" and covers DISTR_SUBSTR in the original text.
    assertTrue(offsetToSpans.containsKey(DISTR_OFFSETS));
    spanInfo = offsetToSpans.get(DISTR_OFFSETS);
    // assertEquals (rather than assertTrue(equals)) so a failure reports the actual label.
    assertEquals("distraction", spanInfo.label);
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // The span at IQ_OFFSETS was deleted: both of its ends map to the same position in the cleaned text.
    assertTrue(offsetToSpans.containsKey(IQ_OFFSETS));
    int iqStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int iqEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(iqStart, iqEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // Round trip: locate "do?" in the cleaned text and recover the same substring from the original.
    int doStart = cleanText.indexOf("do?");
    int doEnd = doStart + 3;
    IntPair origDoOffsets = st.getOriginalOffsets(doStart, doEnd);
    assertEquals("do?", ORIG_TEXT.substring(origDoOffsets.getFirst(), origDoOffsets.getSecond()));
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ParseUtils method getSpanLabeledTree.
/**
 * Recursively rebuilds {@code parseTree} so that every node's label is paired with the span of
 * leaf ids it covers.
 *
 * <p>Leaf ids are handed out left to right, starting from {@code currentLeafId}. A leaf whose
 * parent label satisfies {@code ParseTreeProperties.isNullLabel} receives the empty span
 * {@code (id, id)} and does not consume an id; every other leaf receives {@code (id, id + 1)}.
 * An internal node spans from the first id handed to its subtree up to the next unused id.
 *
 * @param parseTree the (sub)tree to convert
 * @param currentLeafId the id to give to the next id-consuming leaf
 * @return the span-labeled tree, paired with the next unused leaf id
 */
private static Pair<Tree<Pair<String, IntPair>>, Integer> getSpanLabeledTree(Tree<String> parseTree, int currentLeafId) {
    if (!parseTree.isLeaf()) {
        final int firstLeafId = currentLeafId;
        int nextLeafId = currentLeafId;

        // Convert the children first: this node's span end is only known once they are done.
        List<Tree<Pair<String, IntPair>>> labeledChildren = new ArrayList<>();
        for (Tree<String> child : parseTree.getChildren()) {
            Pair<Tree<Pair<String, IntPair>>, Integer> childResult = getSpanLabeledTree(child, nextLeafId);
            labeledChildren.add(childResult.getFirst());
            nextLeafId = childResult.getSecond();
        }

        IntPair nodeSpan = new IntPair(firstLeafId, nextLeafId);
        Pair<String, IntPair> nodeLabel = new Pair<>(parseTree.getLabel(), nodeSpan);
        Tree<Pair<String, IntPair>> node = new Tree<>(nodeLabel);
        for (Tree<Pair<String, IntPair>> labeledChild : labeledChildren) {
            node.addSubtree(labeledChild);
        }
        return new Pair<>(node, nextLeafId);
    }

    // Leaf: it consumes an id unless its parent carries a "null" label.
    boolean underNullLabel = ParseTreeProperties.isNullLabel(parseTree.getParent().getLabel());
    int idAfterLeaf = underNullLabel ? currentLeafId : currentLeafId + 1;
    IntPair leafSpan = new IntPair(currentLeafId, idAfterLeaf);
    Pair<String, IntPair> leafLabel = new Pair<>(parseTree.getLabel(), leafSpan);
    Tree<Pair<String, IntPair>> leaf = new Tree<>(leafLabel);
    return new Pair<>(leaf, idAfterLeaf);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class ParseUtils method getTokenIndexedParseTreeNodeCovering.
/**
 * Returns the node of the parse tree (taken from view {@code parseViewName}) that covers the
 * tokens of constituent {@code c}, with every node labeled by its token span.
 *
 * <p>The lookup is done in sentence-relative token offsets; the spans in the returned tree are
 * shifted back by the sentence's start span so they are expressed in the same coordinates as
 * {@code c.getStartSpan()} / {@code c.getEndSpan()}.
 *
 * @param parseViewName name of the parse view to read the tree from
 * @param c the constituent to cover
 * @return the covering subtree, each label paired with its shifted token span
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedParseTreeNodeCovering(String parseViewName, Constituent c) {
    TextAnnotation annotation = c.getTextAnnotation();
    int sentenceIndex = annotation.getSentenceId(c);
    Tree<String> sentenceParse = getParseTree(parseViewName, annotation, sentenceIndex);

    // Final because the anonymous transformer below captures it.
    final int sentenceOffset = annotation.getSentence(sentenceIndex).getStartSpan();

    // Look the constituent up in sentence-relative token coordinates.
    int relativeStart = c.getStartSpan() - sentenceOffset;
    int relativeEnd = c.getEndSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> sentenceRelativeTree = getTokenIndexedTreeCovering(sentenceParse, relativeStart, relativeEnd);

    // Undo the shift on every node so the spans line up with the constituent's own coordinates.
    return Mappers.mapTree(sentenceRelativeTree, new ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>>() {
        @Override
        public Pair<String, IntPair> transform(Tree<Pair<String, IntPair>> input) {
            Pair<String, IntPair> relativeLabel = input.getLabel();
            IntPair relativeSpan = relativeLabel.getSecond();
            int shiftedStart = relativeSpan.getFirst() + sentenceOffset;
            int shiftedEnd = relativeSpan.getSecond() + sentenceOffset;
            IntPair shiftedSpan = new IntPair(shiftedStart, shiftedEnd);
            return new Pair<>(relativeLabel.getFirst(), shiftedSpan);
        }
    });
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class TextAnnotationUtilitiesTest method test.
/**
 * Extracts sentence 2 of a three-sentence dummy annotation with
 * {@code TextAnnotationUtilities.getSubTextAnnotation}, checks its text, available views, shallow
 * parse and gold parse, then maps the sentence-level annotations back onto an empty copy of the
 * full text with {@code mapSentenceAnnotationsToText} and checks the SRL spans and the parse.
 *
 * <p>Note that this test also checks getSubTextAnnotation()'s behavior when a sentence has no
 * constituents in a particular view (that view is not generated in the sentence-level version).
 */
@Test
public void test() {
    TextAnnotation ta = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(false, 3);
    TextAnnotation subTA = TextAnnotationUtilities.getSubTextAnnotation(ta, 2);

    // assertEquals (expected first) so that a mismatch reports both strings.
    assertEquals("The paving commenced Monday and will finish in June .", subTA.getText());
    assertEquals("[SRL_VERB, PSEUDO_PARSE_STANFORD, POS, NER_CONLL, LEMMA, SHALLOW_PARSE, TOKENS, SENTENCE, PARSE_GOLD]", subTA.getAvailableViews().toString());
    assertEquals("[NP The paving ] [VP commenced ] [NP Monday ] [VP will finish ] [PP in ] [NP June ] ", subTA.getView(ViewNames.SHALLOW_PARSE).toString());

    String parse = "(S1 (S (NP (DT The)\n"
            + " (NN paving))\n"
            + " (VP (VP (VBD commenced)\n"
            + " (NP (NNP Monday)))\n"
            + " (CC and)\n"
            + " (VP (MD will)\n"
            + " (VP (VB finish)\n"
            + " (PP (IN in)\n"
            + " (NP (NNP June))))))\n"
            + " (. .)))";
    String subTaStr = subTA.getView(ViewNames.PARSE_GOLD).toString().trim();
    assertEquals(parse, subTaStr);

    // Token offset of sentence 2 within the full text; used to compare spans across the two annotations.
    int subTaOffset = ta.getSentence(2).getStartSpan();

    // Same text, but generated with an empty list of views to add.
    String[] viewsToAdd = new String[] {};
    TextAnnotation emptyTa = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(viewsToAdd, false, 3);

    // Record the sentence-relative SRL spans before mapping them onto the full text.
    Set<IntPair> srlSpans = new HashSet<>();
    for (Constituent c : subTA.getView(ViewNames.SRL_VERB).getConstituents()) {
        srlSpans.add(c.getSpan());
    }

    TextAnnotationUtilities.mapSentenceAnnotationsToText(subTA, emptyTa, 2);

    // Every mapped SRL constituent, shifted back by the sentence offset, must be one of the recorded spans.
    View srlView = emptyTa.getView(ViewNames.SRL_VERB);
    for (Constituent c : srlView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        IntPair adjustedSpan = new IntPair(cSpan.getFirst() - subTaOffset, cSpan.getSecond() - subTaOffset);
        assertTrue(srlSpans.contains(adjustedSpan));
    }

    TreeView parseView = (TreeView) emptyTa.getView(ViewNames.PARSE_GOLD);
    String mappedParse = parseView.toString().trim();
    assertEquals(parse, mappedParse);
}
use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.
the class TestLongestCommonSubsequence method testGetLCSMatchSentence.
// protected void setUp() throws Exception {
//
// }
/**
 * Aligns two sentences that differ only by a space before the final period and checks that every
 * aligned pair returned by {@code LongestCommonSubsequence.getCharacterLCS} points at the same
 * character in both strings.
 */
@Test
public void testGetLCSMatchSentence() {
    final String compact = "Dan bought two books.";
    final String spaced = "Dan bought two books .";

    List<IntPair> alignment = LongestCommonSubsequence.getCharacterLCS(compact, spaced);
    for (IntPair aligned : alignment) {
        // The "- 1" suggests the returned positions are 1-based -- confirm against getCharacterLCS.
        char fromCompact = compact.charAt(aligned.getFirst() - 1);
        char fromSpaced = spaced.charAt(aligned.getSecond() - 1);
        assertEquals(fromCompact, fromSpaced);
    }
}
Aggregations