Use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
From the class TACReaderTest, method main.
public static void main(String[] args) {
    TACReader tacReader = null;
    try {
        tacReader = new TACReader(CORPUS_ROOT, true);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("ERROR: " + NAME + ": couldn't instantiate TACReader: " + e.getMessage());
    }

    // scan the corpus until the document with the wanted id is found
    String wantedId = "ENG_NW_001278_20130318_F00012HTB.xml";
    XmlTextAnnotation outputXmlTa = null;
    do {
        try {
            outputXmlTa = tacReader.next();
        } catch (IllegalStateException e) {
            e.printStackTrace();
        }
    } while (!outputXmlTa.getTextAnnotation().getId().equals(wantedId) && tacReader.hasNext());

    if (!outputXmlTa.getTextAnnotation().getId().equals(wantedId))
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");

    // the XmlTextAnnotation bundles the cleaned-text TextAnnotation, the original-xml-to-cleaned-text
    // StringTransformation, and the retained xml markup spans
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);

    Set<String> docIdReported = markupAttributes.get(IDOFFSETS);
    assert (docIdReported.contains(ID));
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
}
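The assertions above verify that each attribute value recorded by XmlDocumentProcessor matches the text at its recorded character offsets in the original xml. That check could be factored into a small predicate; a minimal sketch, assuming only the types already visible in this example (the Map<IntPair, Set<String>> from compileAttributeValues, the original xml string, and IntPair offsets). The helper name attributeMatchesSource is hypothetical, not part of cogcomp-nlp.

// Hypothetical helper: true if the expected value was recorded at the given original-xml offsets
// and the original xml text at that span is exactly that value.
static boolean attributeMatchesSource(Map<IntPair, Set<String>> markupAttributes, String origXml,
        IntPair offsets, String expectedValue) {
    Set<String> valuesAtSpan = markupAttributes.get(offsets);
    return valuesAtSpan != null && valuesAtSpan.contains(expectedValue)
            && expectedValue.equals(origXml.substring(offsets.getFirst(), offsets.getSecond()));
}

With such a helper, the two assertEquals calls above could become, e.g., assertTrue(attributeMatchesSource(markupAttributes, origXml, DATETIMEOFFSETS, DATETIMEVAL)).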
Use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
From the class XmlTextAnnotationMakerOntonotesTest, method testNestedNames.
/**
* the edit offsets get messed up when there are nested tags.
*/
@Test
public void testNestedNames() {
String text = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";
// we keep everything.
XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
StatefulTokenizer st = new StatefulTokenizer();
TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
// read the file and create the annotation.
XmlTextAnnotation xta = xtam.createTextAnnotation(text, "OntoNotes 5.0", "test");
TextAnnotation ta = xta.getTextAnnotation();
List<XmlDocumentProcessor.SpanInfo> fudge = xta.getXmlMarkup();
StringTransformation xst = xta.getXmlSt();
for (XmlDocumentProcessor.SpanInfo si : fudge) {
int newTextStart = xst.computeModifiedOffsetFromOriginal(si.spanOffsets.getFirst());
int newTextEnd = xst.computeModifiedOffsetFromOriginal(si.spanOffsets.getSecond());
String neStr = ta.getText().substring(newTextStart, newTextEnd);
assertTrue(REF_ENTITIES.contains(neStr));
}
}
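The loop above projects each retained markup span from its original-xml offsets into the cleaned text via StringTransformation.computeModifiedOffsetFromOriginal. The same projection, pulled out as a reusable method, might look like the following sketch; it uses only the accessors shown in this test, and the helper name cleanTextForXmlSpan is hypothetical.

// Hypothetical helper, not part of cogcomp-nlp: given a span in original-xml character offsets,
// return the corresponding substring of the cleaned text held by the XmlTextAnnotation.
public static String cleanTextForXmlSpan(XmlTextAnnotation xmlTa, IntPair xmlOffsets) {
    StringTransformation st = xmlTa.getXmlSt();
    int cleanStart = st.computeModifiedOffsetFromOriginal(xmlOffsets.getFirst());
    int cleanEnd = st.computeModifiedOffsetFromOriginal(xmlOffsets.getSecond());
    // a span that was dropped entirely from the cleaned text should collapse to an empty substring
    return xmlTa.getTextAnnotation().getText().substring(cleanStart, cleanEnd);
}

In the test body, the three lines inside the for loop then reduce to a single call such as cleanTextForXmlSpan(xta, si.spanOffsets).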
Use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
From the class EREReaderTest, method main.
//
// "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/" +
// "data/source/ENG_DF_001241_20150407_F0000007T.xml";
// public void testNerReader() {
/**
* There are THREE ERE English releases.
* Regrettably, they do not follow consistent standards for organization or for annotation.
*
* LDC2015E29_DEFT_Rich_ERE English V2 has two sets of annotation files: one, used for the Event Argument Extraction
* task in TAC that year, includes a small amount of additional markup to make each xml document well-formed.
* This changes the annotation offsets. Taggable entities within quoted blocks are annotated.
*
* LDC2015E68_DEFT_Rich_ERE_English R2_V2 has as source files excerpts from multi-post discussion forum documents.
* Taggable entities within quoted blocks are annotated.
*
* LDC2016E31_DEFT_Rich_ERE_English ENR3 has -- I believe -- complete threads, where annotation files may be
* broken into several chunks. Taggable entities within quoted blocks are NOT marked.
*
* There are two Spanish and two Chinese ERE releases (aside from a parallel English-Chinese release).
* The first Spanish/Chinese release has the same characteristics as English release 2.
* The second Spanish/Chinese release has the same characteristics as English release 3.
* @param args
*/
public static void main(String[] args) {
    /*
     * ERE documents in release 2015E29: mainly newswire, some discussion format.
     * This test uses the Event Argument Extraction version of the data, as this includes xml markup that makes
     * the source files well-formed, and we are likely to need this reader for TAC EAE tasks. Moreover, the later
     * ERE release uses this format.
     */
    String corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2/data/";
    XmlTextAnnotation outputXmlTa = runTest(EreCorpus.ENR1, corpusDir);

    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2/data/";
    outputXmlTa = runTest(EreCorpus.ENR2, corpusDir);

    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/data/";
    outputXmlTa = runTest(EreCorpus.ENR3, corpusDir);

    System.out.println("Testing EREMentionRelationReader...");

    // check that retained xml markup attribute values line up with their offsets in the original xml source
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);

    Set<String> dateTimeReported = markupAttributes.get(DATETIMEOFFSETS);
    assert (dateTimeReported.contains(DATETIMEVAL));
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));

    // private static final String ORIGAUTHVAL = "tinydancer";
    // private static final IntPair ORIGAUTHOFFSETS = new IntPair(2943, 2953);
    Set<String> origAuth = markupAttributes.get(ORIGAUTHOFFSETS);
    assert (origAuth.contains(ORIGAUTHVAL));
    assertEquals(ORIGAUTHVAL, origXml.substring(ORIGAUTHOFFSETS.getFirst(), ORIGAUTHOFFSETS.getSecond()));

    Set<String> auth = markupAttributes.get(AUTHOROFFSETS);
    assert (auth.contains(AUTHORVAL));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));

    /*
     * other values recorded at the same offsets are not required to be mapped to xml document char offsets.
     * Since this value is not retained in the cleaned text, there is NO CORRESPONDING CONSTITUENT.
     */
    XmlDocumentProcessor.SpanInfo postSpan = markupInfo.get(POSTOFFSETS);
    String mid = postSpan.attributes.get(ENTITY_MENTION_ID).getFirst();
    assertEquals(MENTION_ID_VAL, mid);
    String nt = markupInfo.get(POSTOFFSETS).attributes.get(NOUN_TYPE).getFirst();
    assertEquals(NOUN_TYPE_VAL, nt);
    String eid = markupInfo.get(POSTOFFSETS).attributes.get(ENTITY_ID).getFirst();
    assertEquals(ENTITY_ID_VAL, eid);
    String spec = markupInfo.get(POSTOFFSETS).attributes.get(SPECIFICITY).getFirst();
    assertEquals(SPECIFICITY_VAL, spec);

    assertEquals(QUOTE, markupInfo.get(QUOTEOFFSETS).label);
    String quoteStr = origXml.substring(QUOTEOFFSETS.getFirst(), QUOTEOFFSETS.getSecond());
    assertEquals(QUOTE_VAL, quoteStr);

    String wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runRelationReader(corpusDir, wantedId);

    wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runEventReader(corpusDir, wantedId);

    corpusDir = "/shared/corpora/corporaWeb/deft/event/LDC2016E73_TAC_KBP_2016_Eval_Core_Set_Rich_ERE_Annotation_with_Augmented_Event_Argument_v2/data/eng/nw";
    String newWantedId = "ENG_NW_001278_20131206_F00011WGK.xml";
    XmlTextAnnotation xmlTa = runEventReader(corpusDir, newWantedId);

    // serialize the resulting TextAnnotation to json and write it to disk
    List<String> output = Collections.singletonList(SerializationHelper.serializeToJson(xmlTa.getTextAnnotation(), true));
    try {
        LineIO.write("ereOut.json", output);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
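The repeated markupInfo.get(POSTOFFSETS).attributes.get(...).getFirst() lookups in the test above follow one pattern: fetch the SpanInfo at a span's original-xml offsets and read the first recorded value of a named attribute. A sketch of a small accessor for that pattern, using only the calls that appear in the test; the helper name firstAttributeValue is hypothetical.

// Hypothetical helper: first recorded value of the named attribute for the markup span at the
// given original-xml offsets, or null if the span or attribute is not present.
static String firstAttributeValue(Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo,
        IntPair spanOffsets, String attributeName) {
    XmlDocumentProcessor.SpanInfo span = markupInfo.get(spanOffsets);
    if (span == null || !span.attributes.containsKey(attributeName))
        return null;
    return span.attributes.get(attributeName).getFirst();
}

With it, the four attribute assertions read, e.g., assertEquals(ENTITY_ID_VAL, firstAttributeValue(markupInfo, POSTOFFSETS, ENTITY_ID)).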