Search in sources :

Example 1 with XmlDocumentProcessor

use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.

the class XmlDocumentProcessorTest method testXmlDocumentProcessor.

/**
 * Runs {@link XmlDocumentProcessor#processXml} over ORIG_TEXT and checks that the cleaned text matches
 * CLEAN_TEXT, that the requested tag attributes were retained together with their offsets, and that
 * offsets can be mapped between the original and the cleaned text in both directions.
 */
@Test
public void testXmlDocumentProcessor() {
    /*
     * Sample of the markup being processed (presumably the opening of ORIG_TEXT -- confirm against the
     * constant):
     *
     * <doc id="ENG_DF_001241_20150407_F0000007T">
     * <headline>
     * cuba
     * </headline>
     * <post id="p1" author="chatmasta" datetime="2015-04-07T14:42:00">
     */

    // attributes to retain, keyed by the tag that carries them
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add("author");
    postAttributes.add("id");
    postAttributes.add("datetime");
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("id");
    Map<String, Set<String>> retainedAttributesByTag = new HashMap<>();
    retainedAttributesByTag.put("post", postAttributes);
    retainedAttributesByTag.put("doc", docAttributes);

    Set<String> spanTagsToDelete = new HashSet<>();
    spanTagsToDelete.add("quote");
    spanTagsToDelete.add("distraction");

    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add("img");
    ignoredTags.add("snip");

    boolean failOnUnknownTag = true;
    XmlDocumentProcessor processor =
            new XmlDocumentProcessor(spanTagsToDelete, retainedAttributesByTag, ignoredTags, failOnUnknownTag);
    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> result = processor.processXml(ORIG_TEXT);

    // the transformation must keep the untouched source and yield the expected cleaned text
    StringTransformation st = result.getFirst();
    List<XmlDocumentProcessor.SpanInfo> markup = result.getSecond();
    String cleanText = st.getTransformedText();
    assertEquals(ORIG_TEXT, st.getOrigText());
    assertEquals(CLEAN_TEXT, cleanText);

    Map<IntPair, XmlDocumentProcessor.SpanInfo> spansByOffsets =
            XmlDocumentProcessor.compileOffsetSpanMapping(markup);

    // the 'post' span retains its author attribute: both the value and where it sits in the source
    assertTrue(spansByOffsets.containsKey(POST_OFFSETS));
    XmlDocumentProcessor.SpanInfo postSpan = spansByOffsets.get(POST_OFFSETS);
    assertTrue(postSpan.attributes.containsKey(AUTHOR));
    assertEquals(NAME, postSpan.attributes.get(AUTHOR).getFirst());
    assertEquals(AUTHOR_OFFSETS, postSpan.attributes.get(AUTHOR).getSecond());
    String authorInSource = st.getOrigText().substring(AUTHOR_OFFSETS.getFirst(), AUTHOR_OFFSETS.getSecond());
    assertEquals(NAME, authorInSource);

    // the 'distraction' span is reported with offsets that index into the original text
    assertTrue(spansByOffsets.containsKey(DISTR_OFFSETS));
    XmlDocumentProcessor.SpanInfo distractionSpan = spansByOffsets.get(DISTR_OFFSETS);
    assertTrue(distractionSpan.label.equals("distraction"));
    assertEquals(DISTR_SUBSTR, ORIG_TEXT.substring(DISTR_OFFSETS.getFirst(), DISTR_OFFSETS.getSecond()));

    // the span at IQ_OFFSETS was deleted, so its two ends map to an empty range of the cleaned text
    assertTrue(spansByOffsets.containsKey(IQ_OFFSETS));
    int cleanIqStart = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getFirst());
    int cleanIqEnd = st.computeModifiedOffsetFromOriginal(IQ_OFFSETS.getSecond());
    assertEquals("", cleanText.substring(cleanIqStart, cleanIqEnd));
    assertEquals(ORIG_TEXT.indexOf("Whassup"), IQ_OFFSETS.getFirst());

    // reverse direction: a substring located in the cleaned text maps back to the same characters
    int cleanDoStart = cleanText.indexOf("do?");
    int cleanDoEnd = cleanDoStart + 3;
    IntPair sourceDoOffsets = st.getOriginalOffsets(cleanDoStart, cleanDoEnd);
    assertEquals("do?", ORIG_TEXT.substring(sourceDoOffsets.getFirst(), sourceDoOffsets.getSecond()));
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Test(org.junit.Test)

Example 2 with XmlDocumentProcessor

use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.

the class OntonotesNerReaderExample method main.

/**
 * Example driver: reads a single OntoNotes ".name" file, strips its xml markup into a
 * {@link TextAnnotation}, and adds one NER constituent to a {@link SpanLabelView} for every
 * "enamex" span found in the markup, printing progress as it goes.
 *
 * @param args not used.
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String inFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    int counter = 0;
    long start = System.currentTimeMillis();

    // attributes to retain, per tag: 'docno' on doc, 'type' on enamex
    Set<String> docAttributes = new HashSet<>();
    docAttributes.add("docno");
    Set<String> enamexAttributes = new HashSet<>();
    enamexAttributes.add("type");
    Map<String, Set<String>> retainedAttributesByTag = new HashMap<>();
    retainedAttributesByTag.put("doc", docAttributes);
    retainedAttributesByTag.put("enamex", enamexAttributes);

    // both tag sets are deliberately left empty: we keep everything.
    Set<String> spanTags = new HashSet<>();
    Set<String> ignoredTags = new HashSet<>();
    boolean throwExceptionOnXmlParseFail = true;
    XmlDocumentProcessor xmlProcessor =
            new XmlDocumentProcessor(spanTags, retainedAttributesByTag, ignoredTags, throwExceptionOnXmlParseFail);

    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker annotationMaker = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);

    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = annotationMaker.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> markupSpans = xta.getXmlMarkup();
    System.out.println(ta + "\n");

    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo span : markupSpans) {
        if ("enamex".equalsIgnoreCase(span.label)) {
            IntPair sourceOffsets = span.spanOffsets;
            String neLabel = span.attributes.get("type").getFirst();

            // translate the span's offsets in the xml source into offsets in the cleaned text
            int cleanStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getFirst());
            int cleanEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(sourceOffsets.getSecond());
            System.err.println("ne string: '" + cleanText.substring(cleanStart, cleanEnd) + "'");

            int firstToken = ta.getTokenIdFromCharacterOffset(cleanStart);
            // cleanEnd is one-past-the-end, but the token lookup wants the offset of the last character
            int lastToken = ta.getTokenIdFromCharacterOffset(cleanEnd - 1);
            // constituent token spans are one-past-the-end again, hence the + 1
            Constituent neConstituent =
                    new Constituent(neLabel, nerView.getViewName(), ta, firstToken, lastToken + 1);
            nerView.addConstituent(neConstituent);
        }
        // NOTE(review): this runs once per markup span, so "documents" in the message is really a
        // span count and the whole view is printed on every iteration -- confirm that is intended.
        counter++;
        long elapsed = System.currentTimeMillis() - start;
        System.out.println("Read " + counter + " documents in " + elapsed);
        System.out.println(nerView);
    }
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)

Example 3 with XmlDocumentProcessor

use of edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor in project cogcomp-nlp by CogComp.

the class EREDocumentReader method buildXmlTextAnnotationMaker.

/**
 * Creates an {@link XmlTextAnnotationMaker} set up for ERE-annotated source documents. The supplied
 * {@link TextAnnotationBuilder} must already be configured for the target language.
 *
 * @param textAnnotationBuilder a TextAnnotationBuilder with tokenizer suited to target language.
 * @param ereCorpus the ERE corpus release being read; for {@code EreCorpus.ENR3} the QUOTE tag is
 *                  additionally registered as a deletable span tag.
 * @param throwExceptionOnXmlParseFail if 'true', the XmlTextAnnotationMaker will throw an exception if any
 *                                     errors are found in the source xml.
 * @return an XmlTextAnnotationMaker configured to parse an ERE corpus.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(TextAnnotationBuilder textAnnotationBuilder, EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
    // attributes to retain, one set per tag
    Set<String> postAttributes = new HashSet<>();
    postAttributes.add(AUTHOR);
    postAttributes.add(ID);
    postAttributes.add(DATETIME);

    Set<String> docAttributes = new HashSet<>();
    docAttributes.add(ID);

    Set<String> quoteAttributes = new HashSet<>();
    quoteAttributes.add(ORIG_AUTHOR);

    Map<String, Set<String>> retainedAttributesByTag = new HashMap<>();
    retainedAttributesByTag.put(POST, postAttributes);
    retainedAttributesByTag.put(DOC, docAttributes);
    retainedAttributesByTag.put(QUOTE, quoteAttributes);

    // for release 3 only, quoted blocks are NOT annotated, so their spans are marked as deletable
    Set<String> spanTagsToDelete = new HashSet<>();
    if (EreCorpus.ENR3.equals(ereCorpus)) {
        spanTagsToDelete.add(QUOTE);
    }

    Set<String> ignoredTags = new HashSet<>();
    ignoredTags.add(IMG);
    ignoredTags.add(SNIP);
    ignoredTags.add(SQUISH);

    XmlDocumentProcessor ereXmlProcessor = new XmlDocumentProcessor(
            spanTagsToDelete, retainedAttributesByTag, ignoredTags, throwExceptionOnXmlParseFail);
    return new XmlTextAnnotationMaker(textAnnotationBuilder, ereXmlProcessor);
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)

Aggregations

XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor): 3, XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker): 2, IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 2, StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation): 1, StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer): 1, TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder): 1, Test (org.junit.Test): 1