Search in sources :

Example 41 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class XmlDocumentProcessorTest method testXmlDocumentProcessor.

/**
 * Runs {@code XmlDocumentProcessor.processXml} over {@code ORIG_TEXT} and checks that the cleaned text,
 * the retained tag attributes, and the mapping between cleaned-text and original-text offsets all
 * match the expected fixtures.
 */
@Test
public void testXmlDocumentProcessor() {
    // Sample of the markup this test was written against (per the original author's note):
    //   <doc id="ENG_DF_001241_20150407_F0000007T">
    //   <headline>
    //   cuba
    //   </headline>
    //   <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">

    // Attributes to retain, keyed by the tag they belong to.
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add("author");
    postAttributes.add("id");
    postAttributes.add("datetime");
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("id");
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    tagsWithAtts.put("post", postAttributes);
    tagsWithAtts.put("doc", docAttributes);

    Set<String> deletableSpanTags = new HashSet<>();
    deletableSpanTags.add("quote");
    deletableSpanTags.add("distraction");

    Set<String> tagsToIgnore = new HashSet<>();
    tagsToIgnore.add("img");
    tagsToIgnore.add("snip");

    boolean throwExceptionOnXmlTagMiss = true;
    XmlDocumentProcessor proc = new XmlDocumentProcessor(deletableSpanTags, tagsWithAtts, tagsToIgnore, throwExceptionOnXmlTagMiss);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> nt = proc.processXml(ORIG_TEXT);

    // Check that we retained the right attributes, produced the expected cleaned text, and can
    // recover the offsets of strings in the original text.
    StringTransformation st = nt.getFirst();
    List<XmlDocumentProcessor.SpanInfo> retainedTagInfo = nt.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> offsetToSpans = XmlDocumentProcessor.compileOffsetSpanMapping(retainedTagInfo);

    // The post tag keeps its author attribute, with the value and its original-text offsets.
    assertTrue(offsetToSpans.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo postSpan = offsetToSpans.get(POST_OFFSETS);
    assertTrue(postSpan.attributes.containsKey(AUTHOR));
    assertEquals(NAME, postSpan.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, postSpan.attributes.get(AUTHOR).getSecond());
    String origAuthStr = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, origAuthStr);

    // The distraction span is reported under its own label and original offsets.
    assertTrue(offsetToSpans.containsKey(DISTR_OFFSETS));
    XmlDocumentProcessor.SpanInfo distractionSpan = offsetToSpans.get(DISTR_OFFSETS);
    // assertEquals (rather than assertTrue on equals) so a failure reports the actual label.
    assertEquals("distraction", distractionSpan.label);
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // The span at IQ_OFFSETS was deleted: its two ends bound an empty range in the cleaned text.
    assertTrue(offsetToSpans.containsKey(IQ_OFFSETS));
    int iqStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int iqEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(iqStart, iqEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // Round trip: a string located in the cleaned text maps back to the same string in the original.
    int doStart = cleanText.indexOf("do?");
    int doEnd = doStart + 3;
    IntPair origDoOffsets = st.getOriginalOffsets(doStart, doEnd);
    assertEquals("do?", ORIG_TEXT.substring(origDoOffsets.getFirst(), origDoOffsets.getSecond()));
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 42 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ParseUtils method getSpanLabeledTree.

/**
 * Rebuilds {@code parseTree} so that every node's label is paired with a span of leaf indices.
 *
 * <p>A leaf whose parent label satisfies {@code ParseTreeProperties.isNullLabel} receives the empty
 * span {@code (currentLeafId, currentLeafId)} and does not consume an index; any other leaf receives
 * {@code (currentLeafId, currentLeafId + 1)}. An internal node's span runs from the index in effect
 * when it was entered to the index in effect after all of its children were processed.
 *
 * @param parseTree the (sub)tree to relabel
 * @param currentLeafId the index to assign to the next index-consuming leaf
 * @return the relabeled tree together with the next unused leaf index
 */
private static Pair<Tree<Pair<String, IntPair>>, Integer> getSpanLabeledTree(Tree<String> parseTree, int currentLeafId) {
    if (parseTree.isLeaf()) {
        // Leaves under a null-labelled parent take up no index.
        boolean underNullLabel = ParseTreeProperties.isNullLabel(parseTree.getParent().getLabel());
        int leafEnd = underNullLabel ? currentLeafId : currentLeafId + 1;
        Pair<String, IntPair> leafLabel = new Pair<>(parseTree.getLabel(), new IntPair(currentLeafId, leafEnd));
        Tree<Pair<String, IntPair>> leaf = new Tree<>(leafLabel);
        return new Pair<>(leaf, leafEnd);
    }

    // Relabel the children left to right, threading the running leaf index through each call.
    int cursor = currentLeafId;
    List<Tree<Pair<String, IntPair>>> relabeledChildren = new ArrayList<>();
    for (Tree<String> subtree : parseTree.getChildren()) {
        Pair<Tree<Pair<String, IntPair>>, Integer> result = getSpanLabeledTree(subtree, cursor);
        relabeledChildren.add(result.getFirst());
        cursor = result.getSecond();
    }

    // This node covers everything its children covered.
    Pair<String, IntPair> nodeLabel = new Pair<>(parseTree.getLabel(), new IntPair(currentLeafId, cursor));
    Tree<Pair<String, IntPair>> relabeled = new Tree<>(nodeLabel);
    for (Tree<Pair<String, IntPair>> relabeledChild : relabeledChildren) {
        relabeled.addSubtree(relabeledChild);
    }
    return new Pair<>(relabeled, cursor);
}
Also used : ArrayList(java.util.ArrayList) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 43 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ParseUtils method getTokenIndexedParseTreeNodeCovering.

/**
 * Finds the parse-tree node covering constituent {@code c} and returns it with every node's span
 * shifted by the start span of the sentence containing {@code c}.
 *
 * <p>The covering node is looked up with sentence-relative start and end values, so the spans on the
 * result of that lookup are shifted back by the sentence's start span before being returned.
 *
 * @param parseViewName name of the parse view to read the sentence's tree from
 * @param c the constituent to cover
 * @return the covering subtree, each label paired with its shifted span
 */
public static Tree<Pair<String, IntPair>> getTokenIndexedParseTreeNodeCovering(String parseViewName, Constituent c) {
    TextAnnotation ta = c.getTextAnnotation();
    int sentenceId = ta.getSentenceId(c);
    Tree<String> sentenceTree = getParseTree(parseViewName, ta, sentenceId);

    // Offset used both to make the constituent sentence-relative and to undo that afterwards.
    final int sentenceOffset = ta.getSentence(sentenceId).getStartSpan();
    int relativeStart = c.getStartSpan() - sentenceOffset;
    int relativeEnd = c.getEndSpan() - sentenceOffset;
    Tree<Pair<String, IntPair>> coveringNode = getTokenIndexedTreeCovering(sentenceTree, relativeStart, relativeEnd);

    // Adds the sentence offset back onto the span carried by each node's label.
    ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>> shiftBySentenceOffset = new ITransformer<Tree<Pair<String, IntPair>>, Pair<String, IntPair>>() {

        @Override
        public Pair<String, IntPair> transform(Tree<Pair<String, IntPair>> input) {
            Pair<String, IntPair> nodeLabel = input.getLabel();
            IntPair relativeSpan = nodeLabel.getSecond();
            int shiftedStart = relativeSpan.getFirst() + sentenceOffset;
            int shiftedEnd = relativeSpan.getSecond() + sentenceOffset;
            return new Pair<>(nodeLabel.getFirst(), new IntPair(shiftedStart, shiftedEnd));
        }
    };
    return Mappers.mapTree(coveringNode, shiftBySentenceOffset);
}
Also used : ITransformer(edu.illinois.cs.cogcomp.core.transformers.ITransformer) Tree(edu.illinois.cs.cogcomp.core.datastructures.trees.Tree) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 44 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class TextAnnotationUtilitiesTest method test.

/**
 * Exercises {@code TextAnnotationUtilities.getSubTextAnnotation()} and
 * {@code TextAnnotationUtilities.mapSentenceAnnotationsToText()} on sentence 2 of a dummy annotation.
 *
 * <p>Note that this test also checks getSubTextAnnotation()'s behavior when a sentence has no
 * constituents in a particular view (the view is not generated in the sentence-level version).
 */
@Test
public void test() {
    TextAnnotation ta = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(false, 3);
    TextAnnotation subTA = TextAnnotationUtilities.getSubTextAnnotation(ta, 2);

    // assertEquals(expected, actual) is used throughout so that a failure reports both values.
    assertEquals("The paving commenced Monday and will finish in June .", subTA.getText());
    assertEquals("[SRL_VERB, PSEUDO_PARSE_STANFORD, POS, NER_CONLL, LEMMA, SHALLOW_PARSE, TOKENS, SENTENCE, PARSE_GOLD]", subTA.getAvailableViews().toString());
    assertEquals("[NP The paving ] [VP commenced ] [NP Monday ] [VP will finish ] [PP in ] [NP June ] ", subTA.getView(ViewNames.SHALLOW_PARSE).toString());

    String parse = "(S1 (S (NP (DT The)\n"
            + "    (NN paving))\n"
            + "   (VP (VP (VBD commenced)\n"
            + "    (NP (NNP Monday)))\n"
            + "       (CC and)\n"
            + "       (VP (MD will)\n"
            + "           (VP (VB finish)\n"
            + "               (PP (IN in)\n"
            + "                   (NP (NNP June))))))\n"
            + "   (. .)))";
    String subTaStr = subTA.getView(ViewNames.PARSE_GOLD).toString().trim();
    int subTaOffset = ta.getSentence(2).getStartSpan();
    assertEquals(parse, subTaStr);

    // Start from an annotation generated with no views requested, then map the sentence's views onto it.
    String[] viewsToAdd = new String[] {};
    TextAnnotation emptyTa = DummyTextAnnotationGenerator.generateAnnotatedTextAnnotation(viewsToAdd, false, 3);
    Set<IntPair> srlSpans = new HashSet<>();
    for (Constituent c : subTA.getView(ViewNames.SRL_VERB).getConstituents()) {
        srlSpans.add(c.getSpan());
    }
    TextAnnotationUtilities.mapSentenceAnnotationsToText(subTA, emptyTa, 2);

    // Every mapped SRL span, shifted back by the sentence start, must be one of the sub-annotation's spans.
    View srlView = emptyTa.getView(ViewNames.SRL_VERB);
    for (Constituent c : srlView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        IntPair adjustedSpan = new IntPair(cSpan.getFirst() - subTaOffset, cSpan.getSecond() - subTaOffset);
        assertTrue(srlSpans.contains(adjustedSpan));
    }

    // The mapped gold parse must print the same as the sub-annotation's parse.
    TreeView parseView = (TreeView) emptyTa.getView(ViewNames.PARSE_GOLD);
    String mappedParse = parseView.toString().trim();
    assertEquals(parse, mappedParse);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 45 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class TestLongestCommonSubsequence method testGetLCSMatchSentence.

// protected void setUp() throws Exception {
// 
// }
/**
 * Checks that every index pair returned by {@code LongestCommonSubsequence.getCharacterLCS} points
 * at the same character in both input strings.
 */
@Test
public void testGetLCSMatchSentence() {
    final String untokenized = "Dan bought two books.";
    final String tokenized = "Dan bought two books .";
    for (IntPair aligned : LongestCommonSubsequence.getCharacterLCS(untokenized, tokenized)) {
        // The test subtracts one from each element before indexing, i.e. it treats them as 1-based.
        char inFirst = untokenized.charAt(aligned.getFirst() - 1);
        char inSecond = tokenized.charAt(aligned.getSecond() - 1);
        assertEquals(inFirst, inSecond);
    }
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3