Search in sources :

Example 1 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class ERENerReader method getTokenOffsets.

/**
 * Find the token offsets in the TextAnnotation that correspond to the source character offsets for the given
 * mention.
 * <p>
 * If the source span starts with a space or a newline, both source offsets are shifted right by one character
 * before they are mapped into the transformed (cleaned-up) text.
 * <p>
 * If the token lookup throws an {@link IllegalArgumentException}, the error is logged and the JVM is terminated
 * via {@code System.exit(1)}. Any other {@link RuntimeException} from the lookup increments
 * {@code numOffsetErrors}, logs the tokens around the best-effort start/end indexes, and yields {@code null}.
 *
 * @param origStartOffset start character offset from xml markup
 * @param origEndOffset   end character offset from xml markup
 * @param mentionForm     mention form from xml markup (used only in log messages)
 * @param xmlTa           XmlTextAnnotation object storing original xml, transformed text, extracted xml markup,
 *                        and corresponding TextAnnotation
 * @return IntPair(-1, -1) if the specified offsets correspond to deleted span (and hence likely a name mention
 * in xml metadata, e.g. post author); null if no mapped tokens could be found (possibly, indexes refer
 * to the middle of a single token because tokenizer can't segment some strings); or the corresponding
 * token indexes
 */
protected IntPair getTokenOffsets(int origStartOffset, int origEndOffset, String mentionForm, XmlTextAnnotation xmlTa) {
    StringTransformation st = xmlTa.getXmlSt();
    String origStr = st.getOrigText().substring(origStartOffset, origEndOffset);
    // skip one leading whitespace character; note that BOTH ends of the span move
    if (origStr.startsWith(" ") || origStr.startsWith("\n")) {
        origStartOffset += 1;
        origEndOffset += 1;
    }
    int adjStart = st.computeModifiedOffsetFromOriginal(origStartOffset);
    int adjEnd = st.computeModifiedOffsetFromOriginal(origEndOffset);
    if (adjStart == adjEnd) {
        // probably, maps to span deleted when creating cleaned-up text
        return new IntPair(-1, -1);
    }
    IntPair returnOffset = null;
    int si = 0, ei = 0;
    TextAnnotation ta = xmlTa.getTextAnnotation();
    String rawText = ta.getText();
    String rawStr = rawText.substring(adjStart, adjEnd);
    // origStr was extracted before the whitespace shift above, while the offsets shown are the shifted ones
    logger.debug("source xml str: '{}' ({},{})", origStr, origStartOffset, origEndOffset);
    try {
        si = findStartIndex(adjStart);
        ei = findEndIndex(adjEnd, rawText);
        returnOffset = new IntPair(si, ei);
    } catch (IllegalArgumentException iae) {
        // treated as fatal: log (with the cause) and terminate the JVM
        logger.error("could not find token offsets for mention form '" + mentionForm + "', start, end orig: (" + origStartOffset + "," + origEndOffset + "); adjusted: (" + adjStart + "," + adjEnd + ").", iae);
        System.exit(1);
    } catch (RuntimeException re) {
        numOffsetErrors++;
        // trailing Throwable argument is logged as the exception, not substituted into a placeholder
        logger.error("Error finding text for '{}' at offsets {}:", rawStr, (adjStart + "-" + adjEnd), re);
        // si still holding its initial value 0 is taken to mean the start lookup failed
        // NOTE(review): a start lookup that legitimately returned token 0 is indistinguishable here
        boolean startTokenMissing = (si == 0);
        si = findStartIndexIgnoreError(adjStart);
        ei = findEndIndexIgnoreError(adjEnd);
        if (startTokenMissing) {
            logger.error("Could not find start token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        } else {
            logger.error("Could not find end token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        }
        // log a window of tokens from two before the start index up to (not including) two after the end
        // index, clamped to the token array; the start/end tokens are flagged with ':'
        int max = ta.getTokens().length;
        int start = si >= 2 ? si - 2 : 0;
        int end = (ei + 2) < max ? ei + 2 : max;
        StringBuilder bldr = new StringBuilder();
        for (int jj = start; jj < end; jj++) {
            bldr.append(" ");
            if (jj == si) {
                bldr.append(":");
            }
            bldr.append(ta.getToken(jj));
            if (jj == ei) {
                bldr.append(":");
            }
            bldr.append(" ");
        }
        bldr.append("\n");
        logger.error(bldr.toString());
    }
    return returnOffset;
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 2 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class EREReaderTest method runTest.

/**
 * Reads the first document from an ERE corpus, checks that mapping the cleaned TextAnnotation back to the
 * source text preserves the number of token and sentence constituents, and prints every mention constituent
 * next to the corresponding substring of the original xml.
 *
 * @param ereCorpus  the ERE release being read
 * @param corpusRoot root directory of the corpus
 * @return the XmlTextAnnotation produced by the reader for the first document
 * @throws IllegalStateException if the reader cannot be instantiated (the original exception is the cause)
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    final boolean addNominalMentions = true;
    final boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        // fail here with the real cause instead of hitting a NullPointerException on the reader below
        throw new IllegalStateException(message, e);
    }
    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    // Test TextAnnotationUtilities.mapTransformedTextAnnotationToSource()
    TextAnnotation mappedTa = TextAnnotationUtilities.mapTransformedTextAnnotationToSource(output, outputXmlTa.getXmlSt());
    assertEquals(mappedTa.getView(ViewNames.TOKENS).getNumberOfConstituents(), output.getView(ViewNames.TOKENS).getNumberOfConstituents());
    assertEquals(mappedTa.getView(ViewNames.SENTENCE).getNumberOfConstituents(), output.getView(ViewNames.SENTENCE).getNumberOfConstituents());
    // NOTE: the plain `assert` statements below only fire when the JVM runs with assertions enabled (-ea)
    View nerEre;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    assert (nerEre.getConstituents().size() > 0);
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        // map the constituent's character span in the cleaned text back to the original xml
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        IntPair origOffsets = xmlSt.getOriginalOffsets(start, end);
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }
    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
Also used : EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) ERENerReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IOException(java.io.IOException)

Example 3 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerTest method testWithFile.

/**
 * Builds an XmlTextAnnotation from the given xml file and prints diagnostics that compare spans of the
 * cleaned text with the spans they map back to in the original xml: the third token, every sentence, and
 * finally the attribute values of the retained xml markup.
 * <p>
 * A mismatch for the third token is only reported on stderr. If the file cannot be found the stack trace is
 * printed and the JVM exits with status -1.
 *
 * @param maker   builder used to create the XmlTextAnnotation
 * @param xmlFile path of the xml file to read
 */
private static void testWithFile(XmlTextAnnotationMaker maker, String xmlFile) {
    String rawXml = null;
    try {
        rawXml = LineIO.slurp(xmlFile);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    XmlTextAnnotation xmlTa = maker.createTextAnnotation(rawXml, "test", "test");
    TextAnnotation cleanTa = xmlTa.getTextAnnotation();
    System.out.println(cleanTa.getSentence(0).getText());

    // third token (token span 2-3) of the cleaned text and its character span
    Constituent token = cleanTa.getView(ViewNames.TOKENS).getConstituentsCoveringSpan(2, 3).get(0);
    int tokenStart = token.getStartCharOffset();
    int tokenEnd = token.getEndCharOffset();
    String tokenForm = token.getSurfaceForm();

    // map the span back to the original xml in one call via getOriginalOffsets
    StringTransformation st = xmlTa.getXmlSt();
    IntPair mappedSpan = st.getOriginalOffsets(tokenStart, tokenEnd);
    String sourceForm = st.getOrigText().substring(mappedSpan.getFirst(), mappedSpan.getSecond());
    System.out.println("Third word: " + tokenForm);
    String cleanSubstring = st.getTransformedText().substring(tokenStart, tokenEnd);
    System.out.println("corresponding substring from transformed text: " + cleanSubstring);
    System.out.println("original text substring using mapped offsets: " + sourceForm);
    if (!cleanSubstring.equals(sourceForm)) {
        System.err.println("ERROR: test failed: word '" + cleanSubstring + "' not identical to original word '" + sourceForm + "'. ");
    }

    // print each sentence of the cleaned text next to the source span it maps to
    View sentenceView = xmlTa.getTextAnnotation().getView(ViewNames.SENTENCE);
    for (Constituent sentence : sentenceView.getConstituents()) {
        int cleanStart = sentence.getStartCharOffset();
        int cleanEnd = sentence.getEndCharOffset();
        String cleanText = sentence.getSurfaceForm();
        IntPair src = st.getOriginalOffsets(cleanStart, cleanEnd);
        System.out.println("------\nclean: " + cleanText + ", (" + cleanStart + ", " + cleanEnd + ")");
        System.out.println("------\nsource: " + st.getOrigText().substring(src.getFirst(), src.getSecond()) + ", (" + src.getFirst() + ", " + src.getSecond() + ")\n");
    }

    // dump the attribute values recorded for each retained markup span
    List<XmlDocumentProcessor.SpanInfo> spans = xmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffset = XmlDocumentProcessor.compileOffsetSpanMapping(spans);
    for (Map.Entry<IntPair, XmlDocumentProcessor.SpanInfo> spanEntry : spansByOffset.entrySet()) {
        IntPair span = spanEntry.getKey();
        System.out.print(span.getFirst() + "-" + span.getSecond() + ": ");
        Map<String, Pair<String, IntPair>> attributes = spanEntry.getValue().attributes;
        for (Map.Entry<String, Pair<String, IntPair>> attribute : attributes.entrySet()) {
            System.out.println(attribute.getKey() + ": " + attribute.getValue().getFirst());
        }
        System.out.println();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 4 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class XmlDocumentProcessorTest method testXmlDocumentProcessor.

/**
 * Runs XmlDocumentProcessor over ORIG_TEXT and verifies the cleaned text, the retained tag attributes, and
 * the offset mapping between the cleaned text and the original text in both directions.
 */
@Test
public void testXmlDocumentProcessor() {
    /*
     * The source text begins like this:
     *
     * <doc id="ENG_DF_001241_20150407_F0000007T">
     * <headline>
     * cuba
     * </headline>
     * <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">
     */

    // attribute names registered for each tag
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add("author");
    postAttributes.add("id");
    postAttributes.add("datetime");
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("id");
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    tagsWithAtts.put("post", postAttributes);
    tagsWithAtts.put("doc", docAttributes);

    Set<String> deletableSpanTags = new HashSet<>();
    deletableSpanTags.add("quote");
    deletableSpanTags.add("distraction");

    Set<String> tagsToIgnore = new HashSet<>();
    tagsToIgnore.add("img");
    tagsToIgnore.add("snip");

    boolean throwExceptionOnXmlTagMiss = true;
    XmlDocumentProcessor processor = new XmlDocumentProcessor(deletableSpanTags, tagsWithAtts, tagsToIgnore, throwExceptionOnXmlTagMiss);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> result = processor.processXml(ORIG_TEXT);

    // check that we retained the right attributes, generated a sensible cleaned text, and can recover the
    // offsets of strings in the original text.
    StringTransformation st = result.getFirst();
    List<XmlDocumentProcessor.SpanInfo> retainedTagInfo = result.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffset = XmlDocumentProcessor.compileOffsetSpanMapping(retainedTagInfo);

    // the post span carries the author attribute, whose offsets point at the name in the original text
    assertTrue(spansByOffset.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo postSpan = spansByOffset.get(POST_OFFSETS);
    assertTrue(postSpan.attributes.containsKey(AUTHOR));
    assertEquals(NAME, postSpan.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, postSpan.attributes.get(AUTHOR).getSecond());
    String authorInSource = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, authorInSource);

    // the "distraction" span is recorded with its label and original offsets
    assertTrue(spansByOffset.containsKey(DISTR_OFFSETS));
    XmlDocumentProcessor.SpanInfo distractionSpan = spansByOffset.get(DISTR_OFFSETS);
    assertTrue(distractionSpan.label.equals("distraction"));
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // the IQ_OFFSETS span was deleted: both of its ends map to the same place in the cleaned text
    assertTrue(spansByOffset.containsKey(IQ_OFFSETS));
    int cleanIqStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int cleanIqEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(cleanIqStart, cleanIqEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // a string located in the cleaned text maps back to the same string in the original text
    int cleanDoStart = cleanText.indexOf("do?");
    int cleanDoEnd = cleanDoStart + 3;
    IntPair sourceDoSpan = st.getOriginalOffsets(cleanDoStart, cleanDoEnd);
    assertEquals("do?", ORIG_TEXT.substring(sourceDoSpan.getFirst(), sourceDoSpan.getSecond()));
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 5 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class StringTransformationTest method testReduce.

/**
 * Replaces the first 25 characters of REDUCE with the 3-character string "WWW" and checks the transformed
 * text and the offset mapping in both directions around that shortening edit.
 */
@Test
public void testReduce() {
    StringTransformation st = new StringTransformation(REDUCE);
    // REDUCE is "http://org.edu.net/killit say it's a leg"
    st.transformString(0, 25, "WWW");
    String reduced = st.getTransformedText();
    assertEquals(REDUCE, st.getOrigText());
    // 25 characters replaced by 3, so the text shrinks by 22
    assertEquals(REDUCE.length() - 22, reduced.length());
    assertEquals(MODREDUCE, reduced);

    // original -> modified: the two ends of the edited span
    assertEquals(0, st.computeModifiedOffsetFromOriginal(0));
    assertEquals(3, st.computeModifiedOffsetFromOriginal(25));

    /*
     * what happens if we query a char in the middle of the replaced sequence?
     * -- the assertion pins it to 3, the same modified offset as the end of the edit
     */
    assertEquals(3, st.computeModifiedOffsetFromOriginal(20));

    // modified -> original: the whole replacement maps back to the whole replaced span
    IntPair wholeEdit = st.getOriginalOffsets(0, 3);
    assertEquals(0, wholeEdit.getFirst());
    assertEquals(25, wholeEdit.getSecond());

    // intermediate edit chars map to same offsets, treated like replacements
    IntPair insideEdit = st.getOriginalOffsets(1, 2);
    assertEquals(1, insideEdit.getFirst());
    assertEquals(2, insideEdit.getSecond());

    // 1 past the end of the edit
    IntPair pastEdit = st.getOriginalOffsets(1, 4);
    assertEquals(26, pastEdit.getSecond());
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Aggregations

StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation): 18; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 14; Test (org.junit.Test): 12; XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 4; TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 3; XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation): 3; IOException (java.io.IOException): 2; XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker): 1; Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 1; TextCleanerStringTransformation (edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation): 1; EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader): 1; ERENerReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader): 1; StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 1; TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 1; FileNotFoundException (java.io.FileNotFoundException): 1; List (java.util.List): 1; Set (java.util.Set): 1