use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class EREReaderTest method runTest.
/**
 * Instantiates an {@link EREMentionRelationReader} for the given ERE release, takes the first
 * document it yields, and prints each mention constituent next to the span of the original xml
 * text that it maps back to.
 *
 * @param ereCorpus the ERE release being read
 * @param corpusRoot corpus root, passed through to the reader constructor
 * @return the first {@link XmlTextAnnotation} returned by the reader
 * @throws IllegalStateException if the reader cannot be instantiated
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;

    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        // Fail fast: continuing with a null reader would only surface later as an
        // uninformative NullPointerException on the first call to next().
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        throw new IllegalStateException(message, e);
    }

    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();

    // The flag above is hard-wired to true, so the MENTION_ERE view is the one inspected;
    // the NER_ERE branch is kept for manual toggling.
    View nerEre;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }

    int constituentCount = nerEre.getConstituents().size();
    assert (constituentCount > 0);

    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + constituentCount + " NER constituents: ");

    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        // Map the constituent's offsets in the cleaned text back to offsets in the source xml.
        IntPair origOffsets = xmlSt.getOriginalOffsets(c.getStartCharOffset(), c.getEndCharOffset());
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }

    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMaker method createTextAnnotation.
/**
 * Creates an {@link XmlTextAnnotation} from raw xml text: the xml is run through the xml
 * processor, and a {@link TextAnnotation} is built from the resulting transformed text.
 *
 * @param xmlText raw xml text from a corpus document
 * @param corpusId corpus identifier
 * @param docId text identifier
 * @return an XmlTextAnnotation holding the StringTransformation returned by the xml processor,
 *         the TextAnnotation built from its transformed text, and the xml markup spans
 *         returned alongside it
 */
public XmlTextAnnotation createTextAnnotation(String xmlText, String corpusId, String docId) {
    logger.info("processing text from document {}", docId);

    Pair<StringTransformation, List<XmlDocumentProcessor.SpanInfo>> processed = xmlProcessor.processXml(xmlText);
    StringTransformation xmlSt = processed.getFirst();

    // The TextAnnotation is built over the transformed text, not the raw xml.
    TextAnnotation cleanTa = taBuilder.createTextAnnotation(corpusId, docId, xmlSt.getTransformedText());

    List<XmlDocumentProcessor.SpanInfo> markup = processed.getSecond();
    return new XmlTextAnnotation(xmlSt, cleanTa, markup);
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class EREReaderTest method main.
//
// "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/" +
// "data/source/ENG_DF_001241_20150407_F0000007T.xml";
// public void testNerReader() {
/**
 * Runs the ERE readers over the three English ERE releases, then checks xml markup values and
 * offsets on the last document read, and finally runs the relation and event readers.
 *
 * There are THREE ERE English releases.
 * Regrettably, they do not follow consistent standards for organization or for annotation.
 *
 * LDC2015E29_DEFT_Rich_ERE English V2 has two sets of annotation files: one, used for the Event Argument Extraction
 * task in TAC that year, includes a small amount of additional markup to make each xml document well-formed.
 * This changes the annotation offsets. Taggable entities within quoted blocks are annotated.
 *
 * LDC2015E68_DEFT_Rich_ERE_English R2_V2 has as source files excerpts from multi-post discussion forum documents.
 * Taggable entities within quoted blocks are annotated.
 *
 * LDC2016E31_DEFT_Rich_ERE_English ENR3 has -- I believe -- complete threads, where annotation files may be
 * broken into several chunks. Taggable entities within quoted blocks are NOT marked.
 *
 * There are two Spanish and two Chinese ERE releases (aside from a parallel English-Chinese release).
 * Spanish/Chinese release 1 have the same characteristics as English release 2.
 * Spanish/Chinese release 2 have the same characteristics as English release 3.
 *
 * @param args unused
 */
public static void main(String[] args) {
    /*
     * ERE documents in release 2015E29: mainly newswire, some discussion format.
     * This test uses the Event Argument Extraction version of the data, as this includes xml markup that makes
     * the source files well-formed, and we are likely to need this reader for TAC EAE tasks. Moreover, the later
     * ERE release uses this format.
     */
    final String enr1Dir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2/data/";
    final String enr2Dir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2/data/";
    final String enr3Dir = "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/data/";

    // Only the ENR3 result is inspected below; the first two runs are made for their own
    // internal checks and printed output.
    runTest(EreCorpus.ENR1, enr1Dir);
    runTest(EreCorpus.ENR2, enr2Dir);
    XmlTextAnnotation outputXmlTa = runTest(EreCorpus.ENR3, enr3Dir);

    System.out.println("Testing EREMentionRelationReader...");
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);

    // Each attribute value must be recorded at its expected offsets AND be found at those
    // offsets in the original xml text.
    Set<String> dateTimeReported = markupAttributes.get(DATETIMEOFFSETS);
    assert (dateTimeReported.contains(DATETIMEVAL));
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));

    // Commented-out declarations kept for reference:
    // private static final String ORIGAUTHVAL = "tinydancer";
    // private static final IntPair ORIGAUTHOFFSETS = new IntPair(2943, 2953);
    Set<String> origAuth = markupAttributes.get(ORIGAUTHOFFSETS);
    assert (origAuth.contains(ORIGAUTHVAL));
    assertEquals(ORIGAUTHVAL, origXml.substring(ORIGAUTHOFFSETS.getFirst(), ORIGAUTHOFFSETS.getSecond()));

    Set<String> auth = markupAttributes.get(AUTHOROFFSETS);
    assert (auth.contains(AUTHORVAL));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));

    /*
     * other values recorded at same offsets are not required to be mapped to xml document char offsets.
     * Since this value is not retained in the cleaned text, there is NO CORRESPONDING CONSTITUENT.
     */
    XmlDocumentProcessor.SpanInfo postSpan = markupInfo.get(POSTOFFSETS);
    String mid = postSpan.attributes.get(ENTITY_MENTION_ID).getFirst();
    assertEquals(MENTION_ID_VAL, mid);
    String nt = postSpan.attributes.get(NOUN_TYPE).getFirst();
    assertEquals(NOUN_TYPE_VAL, nt);
    String eid = postSpan.attributes.get(ENTITY_ID).getFirst();
    assertEquals(ENTITY_ID_VAL, eid);
    String spec = postSpan.attributes.get(SPECIFICITY).getFirst();
    assertEquals(SPECIFICITY_VAL, spec);

    assertEquals(QUOTE, markupInfo.get(QUOTEOFFSETS).label);
    String quoteStr = origXml.substring(QUOTEOFFSETS.getFirst(), QUOTEOFFSETS.getSecond());
    assertEquals(QUOTE_VAL, quoteStr);

    // The relation and event readers are both run on the same ENR3 document.
    final String wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runRelationReader(enr3Dir, wantedId);
    runEventReader(enr3Dir, wantedId);

    final String kbpEvalDir = "/shared/corpora/corporaWeb/deft/event/LDC2016E73_TAC_KBP_2016_Eval_Core_Set_Rich_ERE_Annotation_with_Augmented_Event_Argument_v2/data/eng/nw";
    final String newWantedId = "ENG_NW_001278_20131206_F00011WGK.xml";
    runEventReader(kbpEvalDir, newWantedId);
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class StringTransformationTest method testReplace.
/**
 * Replaces two single characters with single-character substitutes and checks that the
 * transformed text keeps the original length and that offsets map to themselves in both
 * directions.
 */
@Test
public void testReplace() {
    StringTransformation transformation = new StringTransformation(REPLACE);
    transformation.transformString(4, 5, "'");
    transformation.transformString(14, 15, "-");
    String transformedText = transformation.getTransformedText();

    // The original text is retained, and a one-for-one replacement leaves the length unchanged.
    assertEquals(REPLACE, transformation.getOrigText());
    assertEquals(REPLACE.length(), transformedText.length());
    assertEquals(MODREPLACE, transformedText);

    // Original -> modified offsets around the second edit.
    int mappedStart = transformation.computeModifiedOffsetFromOriginal(14);
    int mappedEnd = transformation.computeModifiedOffsetFromOriginal(15);
    assertEquals(14, mappedStart);
    assertEquals(15, mappedEnd);

    // Modified -> original offsets for the first edit...
    IntPair firstEditOffsets = transformation.getOriginalOffsets(4, 5);
    assertEquals(4, firstEditOffsets.getFirst());
    assertEquals(5, firstEditOffsets.getSecond());

    // ...and for the second edit.
    IntPair secondEditOffsets = transformation.getOriginalOffsets(14, 15);
    assertEquals(14, secondEditOffsets.getFirst());
    assertEquals(15, secondEditOffsets.getSecond());
}
use of edu.illinois.cs.cogcomp.core.utilities.StringTransformation in project cogcomp-nlp by CogComp.
the class StringTransformationTest method testExpand.
/**
 * Replaces two single characters with longer strings ("``" and "-RCB-") and checks the
 * transformed text, its length, and the offset mappings in both directions.
 */
@Test
public void testExpand() {
    StringTransformation transformation = new StringTransformation(EXPAND);
    transformation.transformString(4, 5, "``");
    transformation.transformString(9, 10, "-RCB-");
    String transformedText = transformation.getTransformedText();

    // Net growth is 5 characters: +1 from "``" and +4 from "-RCB-".
    assertEquals(EXPAND, transformation.getOrigText());
    assertEquals(EXPAND.length() + 5, transformedText.length());
    assertEquals(MODEXPAND, transformedText);

    // Modified -> original: the two-character span [4, 6) maps back to [4, 5).
    IntPair firstEditOffsets = transformation.getOriginalOffsets(4, 6);
    assertEquals(4, firstEditOffsets.getFirst());
    assertEquals(5, firstEditOffsets.getSecond());

    // Modified -> original: the five-character span [10, 15) maps back to [9, 10).
    IntPair secondEditOffsets = transformation.getOriginalOffsets(10, 15);
    assertEquals(9, secondEditOffsets.getFirst());
    assertEquals(10, secondEditOffsets.getSecond());

    // Original -> modified offsets around the second edit.
    int mappedStart = transformation.computeModifiedOffsetFromOriginal(9);
    int mappedEnd = transformation.computeModifiedOffsetFromOriginal(10);
    assertEquals(10, mappedStart);
    assertEquals(15, mappedEnd);
}
Aggregations