Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp:
class MainClass, method test.
/**
 * Evaluates a trained dependency-parsing model on a test set and prints undirected,
 * directed-unlabeled, and labeled attachment counts plus their percentages.
 *
 * @param modelPath    path to the serialized SLModel to load
 * @param testDataPath path to the structured test data
 * @param updateMatrix if true, the labeled evaluation pass updates the confusion
 *                     matrix, which is printed at the end
 * @throws Exception if the model or the test data cannot be loaded
 */
private static void test(String modelPath, String testDataPath, boolean updateMatrix) throws Exception {
    SLModel model = SLModel.loadModel(modelPath);
    ((LabeledChuLiuEdmondsDecoder) model.infSolver).loadDepRelDict();
    SLProblem sp = getStructuredData(testDataPath, (LabeledChuLiuEdmondsDecoder) model.infSolver);
    // Guard against an empty test set: the averages below divide by sp.size(),
    // which would otherwise throw ArithmeticException.
    if (sp.instanceList.isEmpty()) {
        System.out.println("No test instances found in " + testDataPath + "; nothing to evaluate.");
        return;
    }
    double accUndirected = 0.0;
    double accDirectedUnlabeled = 0.0;
    double accLabeled = 0.0;
    double total = 0.0;
    long totalTime = 0L;
    int totalLength = 0;
    for (int i = 0; i < sp.instanceList.size(); i++) {
        DepInst sent = (DepInst) sp.instanceList.get(i);
        totalLength += sent.size();
        DepStruct gold = (DepStruct) sp.goldStructureList.get(i);
        // Time only the inference call, not the evaluation bookkeeping.
        long startTime = System.currentTimeMillis();
        DepStruct prediction = (DepStruct) model.infSolver.getBestStructure(model.wv, sent);
        totalTime += (System.currentTimeMillis() - startTime);
        IntPair tmpUndirected = evaluate(sent, gold, prediction, false, false, false);
        IntPair tmpDirectedUnlabeled = evaluate(sent, gold, prediction, true, false, false);
        IntPair tmpLabeled = evaluate(sent, gold, prediction, true, true, updateMatrix);
        accUndirected += tmpUndirected.getFirst();
        accDirectedUnlabeled += tmpDirectedUnlabeled.getFirst();
        accLabeled += tmpLabeled.getFirst();
        // The original accumulates the denominator only from the directed-unlabeled
        // result; presumably all three evaluate() calls report the same total — TODO confirm.
        total += tmpDirectedUnlabeled.getSecond();
    }
    System.out.println("Parsing time taken for " + sp.size() + " sentences with average length "
            + totalLength / sp.size() + ": " + totalTime);
    System.out.println("Average parsing time " + totalTime / sp.size());
    System.out.println("undirected acc " + accUndirected);
    System.out.println("directed unlabeled acc " + accDirectedUnlabeled);
    System.out.println("labeled acc " + accLabeled);
    System.out.println("total " + total);
    System.out.println("%age correct undirected " + (accUndirected * 1.0 / total));
    System.out.println("%age correct directed & unlabeled " + (accDirectedUnlabeled * 1.0 / total));
    System.out.println("%age correct labeled " + (accLabeled * 1.0 / total));
    if (updateMatrix)
        printMatrix();
    System.out.println("Done with testing!");
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp:
class ERENerReader, method readFiller.
/**
 * Reads a filler annotation node, maps its character offsets into token offsets in
 * the cleaned text, and (if resolvable) adds a corresponding Constituent to the view.
 * WARNING: filler can have null value.
 *
 * @param fillerNode the XML node carrying the filler annotation
 * @param view       the view to which a resolved filler Constituent is added
 * @param xmlTa      the XmlTextAnnotation giving access to the original xml text
 * @throws XMLException if the filler's content string cannot be extracted
 */
private void readFiller(Node fillerNode, View view, XmlTextAnnotation xmlTa) throws XMLException {
    NamedNodeMap attrs = fillerNode.getAttributes();
    String fillerId = attrs.getNamedItem(ID).getNodeValue();
    int start = Integer.parseInt(attrs.getNamedItem(OFFSET).getNodeValue());
    int len = Integer.parseInt(attrs.getNamedItem(LENGTH).getNodeValue());
    String fillerForm = SimpleXMLParser.getContentString((Element) fillerNode);
    String fillerType = attrs.getNamedItem(TYPE).getNodeValue();
    if (null == fillerForm || "".equals(fillerForm))
        throw new IllegalStateException("ERROR: did not find surface form for filler " + attrs.getNamedItem(ID).getNodeValue());
    IntPair tokenSpan = getTokenOffsets(start, start + len, fillerForm, xmlTa);
    if (null == tokenSpan)
        return;
    if (-1 == tokenSpan.getFirst() || -1 == tokenSpan.getSecond()) {
        // Could not locate the filler in the cleaned text; log context and try markup.
        String xmlStr = xmlTa.getXmlSt().getOrigText();
        int windowLo = Math.max(start - 100, 0);
        int windowHi = Math.min(start + 100, xmlStr.length());
        String fillerInfo = "filler form: " + fillerForm + "; orig xml offsets: " + start + ", " + (start + len) + "; context: '" + xmlStr.substring(windowLo, windowHi) + "'\n";
        logger.warn("Couldn't find filler mention in clean text: {}", fillerInfo);
        // look in markup...
        boolean isFillerFound = recordNullMentionInfo(fillerId, fillerId, "FILLER", fillerNode, true);
        if (!isFillerFound)
            logger.warn("ERROR: could not find text/xml markup corresponding to filler." + "Since filler should not be an entity, EITHER it was in a quoted span, and therefore " + "should not have been annotated, or it's in a deleted span that should not have been deleted (check" + " EREDocumentReader's use of XmlDocumentProcessor; were the right tags provided at construction?), " + "OR it is from xml markup and the offsets are incorrect (attempted retrieval allowed for +/- 1 char)\n" + "filler info: " + fillerInfo);
        // logger.warn("could not create filler with id '{}'", nnMap.getNamedItem(ID)
        // .getNodeValue());
        return;
    }
    // filler found...
    if (tokenSpan.getSecond() < tokenSpan.getFirst())
        throw new IllegalStateException("for filler " + fillerId + ", second offset is less than first " + "(first, second:" + tokenSpan.getFirst() + "," + tokenSpan.getSecond() + ").");
    // Constituent end offsets are exclusive, hence the +1.
    Constituent fillerConstituent = new Constituent(fillerType, view.getViewName(), view.getTextAnnotation(), tokenSpan.getFirst(), tokenSpan.getSecond() + 1);
    fillerConstituent.addAttribute(EntityMentionIdAttribute, fillerId);
    fillerConstituent.addAttribute(EntityMentionTypeAttribute, FILL);
    view.addConstituent(fillerConstituent);
    mentionIdToConstituent.put(fillerId, fillerConstituent);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp:
class EREReaderTest, method runRelationReader.
/**
 * Reads the corpus entry with the given id via an EREMentionRelationReader and checks
 * its mention/relation view and its coreference view (including that no duplicate
 * mention spans were added); optionally round-trips the TextAnnotation through JSON.
 *
 * @param corpusDir root directory of the ERE corpus to read
 * @param wantedId  id of the document entry to locate and verify
 * @return the XmlTextAnnotation for the wanted document
 */
private static XmlTextAnnotation runRelationReader(String corpusDir, String wantedId) {
    EREMentionRelationReader emr = null;
    try {
        boolean throwExceptionOnXmlTagMismatch = true;
        emr = new EREMentionRelationReader(EreCorpus.ENR3, corpusDir, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    assert (emr.hasNext());
    // (removed unused local 'posterId' — it was never read)
    XmlTextAnnotation outputXmlTa = null;
    // Scan forward until we hit the wanted document or exhaust the reader.
    do {
        outputXmlTa = emr.next();
    } while (!outputXmlTa.getTextAnnotation().getId().equals(wantedId) && emr.hasNext());
    if (!outputXmlTa.getTextAnnotation().getId().equals(wantedId))
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    assert (output.hasView(ViewNames.MENTION_ERE));
    View nerRelation = output.getView(ViewNames.MENTION_ERE);
    assert (nerRelation.getConstituents().size() > 0);
    System.out.println("EREMentionRelationReader found " + nerRelation.getRelations().size() + " relations: ");
    for (Relation r : nerRelation.getRelations()) System.out.println(TextAnnotationPrintHelper.printRelation(r));
    String relValue = nerRelation.getRelations().get(0).toString();
    assertEquals(RELVALUE, relValue);
    System.out.println(TextAnnotationPrintHelper.OUTPUT_SEPARATOR);
    System.out.println("ERE Coreference chains:");
    assert (output.hasView(ViewNames.COREF_ERE));
    CoreferenceView cView = (CoreferenceView) output.getView(ViewNames.COREF_ERE);
    assert (cView.getConstituents().size() > 0);
    // check no duplicate mentions are added.
    Set<IntPair> mentionSpans = new HashSet<>();
    for (Constituent c : cView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        assertFalse(mentionSpans.contains(cSpan));
        mentionSpans.add(cSpan);
    }
    System.out.println(TextAnnotationPrintHelper.printCoreferenceView(cView));
    if (doSerialize) {
        // Round-trip through JSON to verify serialization does not lose the annotation.
        String jsonStr = SerializationHelper.serializeToJson(output);
        try {
            LineIO.write("EREsample.json", Collections.singletonList(jsonStr));
        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        TextAnnotation newTa = null;
        try {
            newTa = SerializationHelper.deserializeFromJson(jsonStr);
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertNotNull(newTa);
    }
    System.out.println("Report: " + emr.generateReport());
    return outputXmlTa;
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp:
class EREReaderTest, method main.
//
// "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/" +
// "data/source/ENG_DF_001241_20150407_F0000007T.xml";
// public void testNerReader() {
/**
 * there are THREE ERE English releases.
 * Regrettably, they do not follow consistent standards for organization or for annotation.
 *
 * LDC2015E29_DEFT_Rich_ERE English V2 has two sets of annotation files: one, used for the Event Argument Extraction
 * task in TAC that year, includes a small amount of additional markup to make each xml document well-formed.
 * This changes the annotation offsets. Taggable entities within quoted blocks are annotated.
 *
 * LDC2015E68_DEFT_Rich_ERE_English R2_V2 has as source files excerpts from multi-post discussion forum documents.
 * Taggable entities within quoted blocks are annotated.
 *
 * LDC2016E31_DEFT_Rich_ERE_English ENR3 has -- I believe -- complete threads, where annotation files may be
 * broken into several chunks. Taggable entities within quoted blocks are NOT marked.
 *
 * There are two Spanish and two Chinese ERE releases (aside from a parallel English-Chinese release).
 * Spanish/Chinese release 1 have the same characteristics as English release 2.
 * Spanish/Chinese release 2 have the same characteristics as English release 3.
 * @param args
 */
public static void main(String[] args) {
    /*
     * ERE documents in release 2015E29: mainly newswire, some discussion format.
     * This test uses the Event Argument Extraction version of the data, as this includes xml markup that makes
     * the source files well-formed, and we are likely to need this reader for TAC EAE tasks. Moreover, the later
     * ERE release uses this format.
     */
    String corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2/data/";
    XmlTextAnnotation outputXmlTa = runTest(EreCorpus.ENR1, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2/data/";
    outputXmlTa = runTest(EreCorpus.ENR2, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/data/";
    // Only the ENR3 result is inspected below; the earlier runTest calls exercise the readers.
    outputXmlTa = runTest(EreCorpus.ENR3, corpusDir);
    System.out.println("Testing EREMentionRelationReader...");
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);
    // Verify the recorded attribute values match the original xml at the expected offsets.
    Set<String> dateTimeReported = markupAttributes.get(DATETIMEOFFSETS);
    assert (dateTimeReported.contains(DATETIMEVAL));
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    // private static final String ORIGAUTHVAL = "tinydancer";
    // private static final IntPair ORIGAUTHOFFSETS = new IntPair(2943, 2953);
    Set<String> origAuth = markupAttributes.get(ORIGAUTHOFFSETS);
    assert (origAuth.contains(ORIGAUTHVAL));
    assertEquals(ORIGAUTHVAL, origXml.substring(ORIGAUTHOFFSETS.getFirst(), ORIGAUTHOFFSETS.getSecond()));
    Set<String> auth = markupAttributes.get(AUTHOROFFSETS);
    assert (auth.contains(AUTHORVAL));
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
    /*
     * other values recorded at same offsets are not required to be mapped to xml document char offsets.
     * Since this value is not retained in the cleaned text, there is NO CORRESPONDING CONSTITUENT.
     */
    XmlDocumentProcessor.SpanInfo postSpan = markupInfo.get(POSTOFFSETS);
    String mid = postSpan.attributes.get(ENTITY_MENTION_ID).getFirst();
    assertEquals(MENTION_ID_VAL, mid);
    String nt = markupInfo.get(POSTOFFSETS).attributes.get(NOUN_TYPE).getFirst();
    assertEquals(NOUN_TYPE_VAL, nt);
    String eid = markupInfo.get(POSTOFFSETS).attributes.get(ENTITY_ID).getFirst();
    assertEquals(ENTITY_ID_VAL, eid);
    String spec = markupInfo.get(POSTOFFSETS).attributes.get(SPECIFICITY).getFirst();
    assertEquals(SPECIFICITY_VAL, spec);
    assertEquals(QUOTE, markupInfo.get(QUOTEOFFSETS).label);
    String quoteStr = origXml.substring(QUOTEOFFSETS.getFirst(), QUOTEOFFSETS.getSecond());
    assertEquals(QUOTE_VAL, quoteStr);
    // Both readers are pointed at the same document id in the ENR3 corpus.
    String wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runRelationReader(corpusDir, wantedId);
    runEventReader(corpusDir, wantedId);
    corpusDir = "/shared/corpora/corporaWeb/deft/event/LDC2016E73_TAC_KBP_2016_Eval_Core_Set_Rich_ERE_Annotation_with_Augmented_Event_Argument_v2/data/eng/nw";
    String newWantedId = "ENG_NW_001278_20131206_F00011WGK.xml";
    // (removed unused local 'xmlTa' — the returned XmlTextAnnotation was never read)
    runEventReader(corpusDir, newWantedId);
}
Use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp:
class ERENerReader, method getMention.
/**
 * Builds a Constituent for an entity-mention node, resolving its character offsets to
 * token offsets in the cleaned text. Returns null when the mention is skipped: nominal
 * or pronominal mentions while addNominalMentions is false, a missing surface form, or
 * token offsets that could not be resolved.
 *
 * @param mentionNode the XML node carrying the mention annotation
 * @param label       the label to assign to the created Constituent
 * @param view        the view the Constituent will belong to
 * @param xmlTa       the XmlTextAnnotation used to map xml offsets to token offsets
 * @return the created Constituent, or null if the mention was skipped; may also return
 *         null if the Constituent constructor rejected the span (overlap counted)
 * @throws XMLException if the mention's content string cannot be extracted
 */
private Constituent getMention(Node mentionNode, String label, View view, XmlTextAnnotation xmlTa) throws XMLException {
    Constituent mentionConstituent = null;
    NamedNodeMap nnMap = mentionNode.getAttributes();
    String noun_type = nnMap.getNamedItem(NOUN_TYPE).getNodeValue();
    String mId = nnMap.getNamedItem(ID).getNodeValue();
    if (noun_type.equals(PRO) || noun_type.equals(NOM)) {
        if (!addNominalMentions)
            return null;
    }
    /*
     * update this count here to avoid creating discrepancy in file count vs created count if
     * user does not add nominal mentions
     */
    numMentionsInSource++;
    // we have a valid mention(a "NAM" or a "NOM"), add it to our view.
    /*
     * expect one child
     */
    NodeList mnl = ((Element) mentionNode).getElementsByTagName(MENTION_TEXT);
    String mentionForm = null;
    if (mnl.getLength() > 0) {
        mentionForm = SimpleXMLParser.getContentString((Element) mnl.item(0));
    } else {
        logger.error("No surface form found for mention with id {}.", mId);
        return null;
    }
    int offset = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
    int length = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
    IntPair offsets = getTokenOffsets(offset, offset + length, mentionForm, xmlTa);
    if (null == offsets)
        return null;
    else if (-1 == offsets.getFirst() && -1 == offsets.getSecond()) {
        // handled by next layer up, which records the info separately
        return null;
    }
    String headForm = null;
    IntPair headTokenOffsets = null;
    mnl = ((Element) mentionNode).getElementsByTagName(MENTION_HEAD);
    if (mnl.getLength() > 0) {
        Node headNode = mnl.item(0);
        // BUG FIX: read OFFSET/LENGTH from the head node's own attributes. The original
        // code re-read mentionNode.getAttributes() here, so the head's offsets always
        // equaled the mention's offsets and this branch never produced a distinct head span.
        nnMap = headNode.getAttributes();
        // NOTE(review): per the DOM spec, getNodeValue() on an Element node returns null,
        // so headForm is likely always null here — confirm whether
        // SimpleXMLParser.getContentString((Element) headNode) was intended.
        headForm = headNode.getNodeValue();
        int headStart = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
        int headLength = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
        headTokenOffsets = getTokenOffsets(headStart, headStart + headLength, headForm, xmlTa);
    }
    // Fall back to the full mention span when no head could be resolved.
    if (null == headTokenOffsets)
        headTokenOffsets = offsets;
    IntPair headCharOffsets = getCharacterOffsets(headTokenOffsets.getFirst(), headTokenOffsets.getSecond());
    try {
        // Constituent end offsets are exclusive, hence the +1.
        mentionConstituent = new Constituent(label, view.getViewName(), view.getTextAnnotation(), offsets.getFirst(), offsets.getSecond() + 1);
        mentionConstituent.addAttribute(EntityMentionTypeAttribute, noun_type);
        mentionConstituent.addAttribute(EntityMentionIdAttribute, mId);
        mentionConstituent.addAttribute(EntityHeadStartCharOffset, Integer.toString(headCharOffsets.getFirst()));
        mentionConstituent.addAttribute(EntityHeadEndCharOffset, Integer.toString(headCharOffsets.getSecond()));
        mentionIdToConstituent.put(mId, mentionConstituent);
    } catch (IllegalArgumentException iae) {
        // Overlapping span rejected by the Constituent constructor; count and move on.
        numOverlaps++;
    }
    return mentionConstituent;
}
Aggregations