Search in sources :

Example 16 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class TACReaderTest method main.

/**
 * Scans the TAC corpus under {@code CORPUS_ROOT} for one specific document and checks the XML
 * markup recorded for it against the expected constants of this class.
 *
 * <p>The run aborts with an exception if the reader cannot be created, if the wanted document
 * is not in the corpus, or if any of the recorded values differ from the expected ones.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    TACReader tacReader;
    try {
        tacReader = new TACReader(CORPUS_ROOT, true);
    } catch (Exception e) {
        // Nothing below can run without a reader: report and stop here instead of
        // falling through to a NullPointerException on the first use of the reader.
        String message = "ERROR: " + NAME + ": couldn't instantiate TACReader: " + e.getMessage();
        System.err.println(message);
        throw new IllegalStateException(message, e);
    }

    String wantedId = "ENG_NW_001278_20130318_F00012HTB.xml";
    XmlTextAnnotation outputXmlTa = null;
    // Check hasNext() before every next() so an empty corpus is handled, and only keep a
    // document once it has actually been read and matched.
    while (tacReader.hasNext()) {
        XmlTextAnnotation candidate;
        try {
            candidate = tacReader.next();
        } catch (IllegalStateException e) {
            // A document that cannot be read is reported and skipped.
            e.printStackTrace();
            continue;
        }
        if (candidate != null && wantedId.equals(candidate.getTextAnnotation().getId())) {
            outputXmlTa = candidate;
            break;
        }
    }
    if (outputXmlTa == null) {
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
    }

    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);

    // Explicit check rather than the 'assert' keyword, which is skipped unless the JVM is
    // started with -ea; also covers the case where nothing was recorded at these offsets.
    Set<String> docIdReported = markupAttributes.get(IDOFFSETS);
    if (docIdReported == null || !docIdReported.contains(ID)) {
        fail("ERROR: expected id '" + ID + "' was not reported at offsets " + IDOFFSETS + ".");
    }

    // The expected offsets index into the original xml text, not the cleaned text.
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Set(java.util.Set) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)

Example 17 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class XmlTextAnnotationMakerOntonotesTest method testNestedNames.

/**
 * Nested tags: every markup span reported for the sample sentence, once its original-text
 * offsets are mapped to offsets in the modified text, must select a string that is listed
 * in REF_ENTITIES.
 */
@Test
public void testNestedNames() {
    String nestedSample = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";

    // Processor is built from the tag sets declared outside this method.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // Build the annotation directly from the in-memory sample.
    XmlTextAnnotation xmlAnnotation = maker.createTextAnnotation(nestedSample, "OntoNotes 5.0", "test");
    TextAnnotation textAnnotation = xmlAnnotation.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> spans = xmlAnnotation.getXmlMarkup();
    StringTransformation transformation = xmlAnnotation.getXmlSt();

    for (XmlDocumentProcessor.SpanInfo span : spans) {
        // Translate both ends of the span from original-text to modified-text offsets.
        int mappedStart = transformation.computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int mappedEnd = transformation.computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        String covered = textAnnotation.getText().substring(mappedStart, mappedEnd);
        assertTrue(REF_ENTITIES.contains(covered));
    }
}
Also used : XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) XmlTextAnnotationMaker(edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) XmlTextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation) Test(org.junit.Test)

Example 18 with StringTransformation

use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.

the class EREReaderTest method main.

// 
// "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/" +
// "data/source/ENG_DF_001241_20150407_F0000007T.xml";
// public void testNerReader() {
/**
 * Runs the ERE reader checks: reads three English ERE releases via {@code runTest}, verifies the
 * XML markup recorded for the document returned for the third release, runs the relation and
 * event readers on that release, and finally serializes one document from a TAC KBP 2016 event
 * corpus to {@code ereOut.json}.
 *
 * <p>There are THREE ERE English releases.
 * Regrettably, they do not follow consistent standards for organization or for annotation.
 *
 * <p>LDC2015E29_DEFT_Rich_ERE English V2 has two sets of annotation files: one, used for the Event Argument Extraction
 *    task in TAC that year, includes a small amount of additional markup to make each xml document well-formed.
 *    This changes the annotation offsets. Taggable entities within quoted blocks are annotated.
 *
 * <p>LDC2015E68_DEFT_Rich_ERE_English R2_V2 has as source files excerpts from multi-post discussion forum documents.
 * Taggable entities within quoted blocks are annotated.
 *
 * <p>LDC2016E31_DEFT_Rich_ERE_English ENR3 has -- I believe -- complete threads, where annotation files may be
 *    broken into several chunks. Taggable entities within quoted blocks are NOT marked.
 *
 * <p>There are two Spanish and two Chinese ERE releases (aside from a parallel English-Chinese release).
 * Spanish/Chinese release 1 have the same characteristics as English release 2.
 * Spanish/Chinese release 2 have the same characteristics as English release 3.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    /*
         * ERE documents in release 2015E29: mainly newswire, some discussion format.
         * This test uses the Event Argument Extraction version of the data, as this includes xml markup that makes
         * the source files well-formed, and we are likely to need this reader for TAC EAE tasks. Moreover, the later
         * ERE release uses this format.
         */
    String corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2/data/";
    // Only the result for the last release is inspected below; the first two are run but not kept.
    runTest(EreCorpus.ENR1, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2/data/";
    runTest(EreCorpus.ENR2, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/data/";
    XmlTextAnnotation outputXmlTa = runTest(EreCorpus.ENR3, corpusDir);
    System.out.println("Testing EREMentionRelationReader...");
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);

    // The three attribute checks below are explicit rather than 'assert' statements: the
    // 'assert' keyword is skipped unless the JVM is started with -ea, which would let this
    // main() pass without checking anything. They also cover a missing entry in the map.
    Set<String> dateTimeReported = markupAttributes.get(DATETIMEOFFSETS);
    if (dateTimeReported == null || !dateTimeReported.contains(DATETIMEVAL)) {
        throw new AssertionError("value '" + DATETIMEVAL + "' not recorded at offsets " + DATETIMEOFFSETS);
    }
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    // private static final String ORIGAUTHVAL = "tinydancer";
    // private static final IntPair ORIGAUTHOFFSETS = new IntPair(2943, 2953);
    Set<String> origAuth = markupAttributes.get(ORIGAUTHOFFSETS);
    if (origAuth == null || !origAuth.contains(ORIGAUTHVAL)) {
        throw new AssertionError("value '" + ORIGAUTHVAL + "' not recorded at offsets " + ORIGAUTHOFFSETS);
    }
    assertEquals(ORIGAUTHVAL, origXml.substring(ORIGAUTHOFFSETS.getFirst(), ORIGAUTHOFFSETS.getSecond()));
    Set<String> auth = markupAttributes.get(AUTHOROFFSETS);
    if (auth == null || !auth.contains(AUTHORVAL)) {
        throw new AssertionError("value '" + AUTHORVAL + "' not recorded at offsets " + AUTHOROFFSETS);
    }
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
    /*
         * other values recorded at same offsets are not required to be mapped to xml document char offsets.
         * Since this value is not retained in the cleaned text, there is NO CORRESPONDING CONSTITUENT.
         */
    // Look the span up once and fail with a clear message if nothing was recorded there.
    XmlDocumentProcessor.SpanInfo postSpan = markupInfo.get(POSTOFFSETS);
    if (postSpan == null) {
        throw new AssertionError("no markup span recorded at offsets " + POSTOFFSETS);
    }
    String mid = postSpan.attributes.get(ENTITY_MENTION_ID).getFirst();
    assertEquals(MENTION_ID_VAL, mid);
    String nt = postSpan.attributes.get(NOUN_TYPE).getFirst();
    assertEquals(NOUN_TYPE_VAL, nt);
    String eid = postSpan.attributes.get(ENTITY_ID).getFirst();
    assertEquals(ENTITY_ID_VAL, eid);
    String spec = postSpan.attributes.get(SPECIFICITY).getFirst();
    assertEquals(SPECIFICITY_VAL, spec);
    assertEquals(QUOTE, markupInfo.get(QUOTEOFFSETS).label);
    String quoteStr = origXml.substring(QUOTEOFFSETS.getFirst(), QUOTEOFFSETS.getSecond());
    assertEquals(QUOTE_VAL, quoteStr);

    // The relation and event readers are both run on the same document of the third release.
    String wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runRelationReader(corpusDir, wantedId);
    runEventReader(corpusDir, wantedId);

    corpusDir = "/shared/corpora/corporaWeb/deft/event/LDC2016E73_TAC_KBP_2016_Eval_Core_Set_Rich_ERE_Annotation_with_Augmented_Event_Argument_v2/data/eng/nw";
    String newWantedId = "ENG_NW_001278_20131206_F00011WGK.xml";
    XmlTextAnnotation xmlTa = runEventReader(corpusDir, newWantedId);
    List<String> output = Collections.singletonList(SerializationHelper.serializeToJson(xmlTa.getTextAnnotation(), true));
    try {
        LineIO.write("ereOut.json", output);
    } catch (IOException e) {
        // Writing the json dump is best-effort: a failure is reported but does not abort the run.
        e.printStackTrace();
    }
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IOException(java.io.IOException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)18 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)14 Test (org.junit.Test)12 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)4 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)3 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)3 IOException (java.io.IOException)2 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)1 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)1 TextCleanerStringTransformation (edu.illinois.cs.cogcomp.core.utilities.TextCleanerStringTransformation)1 EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)1 ERENerReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader)1 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)1 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)1 FileNotFoundException (java.io.FileNotFoundException)1 List (java.util.List)1 Set (java.util.Set)1