Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.
From the class CreateTrainDevTestSplit, method main.
/**
 * Command-line entry point: reads an ERE corpus, computes a train/dev/test split over its
 * documents, and writes the result into the split directory.
 *
 * <p>Expects exactly three arguments: the {@code EreCorpus} enum constant name, the corpus
 * directory, and the output (split) directory. Split fractions and the view names to consider are
 * taken from the default {@code CorpusSplitConfigurator} configuration. Output consists of
 * {@code countInfo.txt} (per-document, per-split and total label counts) plus one
 * {@code <SplitName>.txt} file per split listing the document ids assigned to it.
 *
 * <p>Any failure to build the reader or to write an output file prints the stack trace and
 * terminates the JVM with exit code -1.
 *
 * @param args {@code EreCorpusType corpusDir splitDir}
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }

    EREDocumentReader.EreCorpus corpusType = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusDir = args[1];
    String splitDir = args[2];

    ResourceManager config = new CorpusSplitConfigurator().getDefaultConfig();
    double trainFrac = config.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = config.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = config.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);

    IOUtils.mkdir(splitDir);
    String outPrefix = splitDir + "/";

    // View names are read from configuration (a hard-coded {ViewNames.EVENT_ERE} was the
    // commented-out alternative here).
    String[] viewNames =
            config.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    String[] labelsToCount = new String[0];

    EREMentionRelationReader reader = null;
    try {
        // 'false': do not throw when the XML parser fails on a document.
        reader = new EREEventReader(corpusType, corpusDir, false);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Drain the reader, remembering each document and the subset of requested views it has.
    Map<String, XmlTextAnnotation> annotationsById = new HashMap<>();
    Map<String, Set<View>> viewsById = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTa = reader.next();
        TextAnnotation ta = xmlTa.getTextAnnotation();
        String docId = ta.getId();
        annotationsById.put(docId, xmlTa);

        Set<View> presentViews = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                presentViews.add(ta.getView(viewName));
            }
        }
        viewsById.put(docId, presentViews);
    }

    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(viewsById, labelsToCount);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> docCounts = creator.getLabelCounts();

    // Assemble the count report: per-document lines, per-split lines, then the grand total.
    List<String> report = new ArrayList<>(splitCounts.size() + 2);
    for (Map.Entry<String, Counter<String>> docEntry : docCounts.entrySet()) {
        report.add(docEntry.getKey() + ": " + printCounts(docEntry.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> splitEntry : splitCounts.entrySet()) {
        report.add(splitEntry.getKey().name() + ": " + printCounts(splitEntry.getValue()));
    }
    Counter<String> totals = creator.getLabelTotals();
    report.add("TOTALS: " + printCounts(totals));

    try {
        LineIO.write(outPrefix + "countInfo.txt", report);
        for (Map.Entry<Split, Set<String>> splitEntry : splits.entrySet()) {
            List<String> docIds = new ArrayList<>(splitEntry.getValue());
            LineIO.write(outPrefix + splitEntry.getKey().name() + ".txt", docIds);
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.
From the class EREReaderTest, method runTest.
/**
 * Reads the first document of an ERE corpus through an {@code EREMentionRelationReader} and
 * prints every mention constituent together with the span of the original XML text it maps
 * back to.
 *
 * @param ereCorpus ERE release identifier handed to the reader constructor
 * @param corpusRoot corpus directory handed to the reader constructor
 * @return the first {@code XmlTextAnnotation} returned by the reader
 * @throws IllegalStateException if the reader cannot be constructed (the original exception is
 *         attached as the cause)
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    ERENerReader nerReader = null;
    // Fixed to true here, so the MENTION_ERE branch below is the one that runs.
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release "
                + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        // Abort here: continuing would only dereference the null reader below and mask the
        // real cause behind a NullPointerException.
        throw new IllegalStateException(message, e);
    }

    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();

    View nerEre;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    assert (nerEre.getConstituents().size() > 0);

    // The string transformation maps offsets in the cleaned text back to the original XML.
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXmlStr = xmlSt.getOrigText();

    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        IntPair origOffsets = xmlSt.getOriginalOffsets(start, end);
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }

    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
Use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.
From the class EREReaderTest, method runRelationReader.
/**
 * Locates the corpus document with the given id using an {@code EREMentionRelationReader} over
 * the {@code EreCorpus.ENR3} release and checks its relation and coreference annotations.
 *
 * <p>Checks performed: the document has a MENTION_ERE view with at least one relation, and the
 * first relation's string form equals {@code RELVALUE}; the document has a COREF_ERE view in
 * which no two constituents share the same span. When {@code doSerialize} is set, the
 * TextAnnotation is also serialized to JSON, written to {@code EREsample.json}, and
 * deserialized again.
 *
 * @param corpusDir corpus directory handed to the reader constructor
 * @param wantedId id of the document to look for
 * @return the {@code XmlTextAnnotation} whose TextAnnotation id equals {@code wantedId}
 */
private static XmlTextAnnotation runRelationReader(String corpusDir, String wantedId) {
    EREMentionRelationReader emr = null;
    try {
        boolean throwExceptionOnXmlTagMismatch = true;
        emr = new EREMentionRelationReader(EreCorpus.ENR3, corpusDir, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Advance until the wanted document shows up or the reader runs dry. Checking hasNext()
    // before every next() also covers an empty corpus, which a Java 'assert' alone would not
    // when assertions are disabled.
    XmlTextAnnotation outputXmlTa = null;
    boolean found = false;
    while (!found && emr.hasNext()) {
        outputXmlTa = emr.next();
        found = outputXmlTa.getTextAnnotation().getId().equals(wantedId);
    }
    if (!found)
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");

    TextAnnotation output = outputXmlTa.getTextAnnotation();
    assert (output.hasView(ViewNames.MENTION_ERE));
    View nerRelation = output.getView(ViewNames.MENTION_ERE);
    assert (nerRelation.getConstituents().size() > 0);

    System.out.println("EREMentionRelationReader found " + nerRelation.getRelations().size() + " relations: ");
    for (Relation r : nerRelation.getRelations())
        System.out.println(TextAnnotationPrintHelper.printRelation(r));

    // Report a missing relation explicitly instead of letting get(0) throw an index error.
    if (nerRelation.getRelations().size() == 0)
        fail("ERROR: corpus entry '" + wantedId + "' has no relations in its mention view.");
    String relValue = nerRelation.getRelations().get(0).toString();
    assertEquals(RELVALUE, relValue);

    System.out.println(TextAnnotationPrintHelper.OUTPUT_SEPARATOR);
    System.out.println("ERE Coreference chains:");
    assert (output.hasView(ViewNames.COREF_ERE));
    CoreferenceView cView = (CoreferenceView) output.getView(ViewNames.COREF_ERE);
    assert (cView.getConstituents().size() > 0);

    // check no duplicate mentions are added.
    Set<IntPair> mentionSpans = new HashSet<>();
    for (Constituent c : cView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        assertFalse(mentionSpans.contains(cSpan));
        mentionSpans.add(cSpan);
    }
    System.out.println(TextAnnotationPrintHelper.printCoreferenceView(cView));

    if (doSerialize) {
        // Round-trip through JSON: write a sample file, then make sure it can be read back.
        String jsonStr = SerializationHelper.serializeToJson(output);
        try {
            LineIO.write("EREsample.json", Collections.singletonList(jsonStr));
        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        TextAnnotation newTa = null;
        try {
            newTa = SerializationHelper.deserializeFromJson(jsonStr);
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertNotNull(newTa);
    }

    System.out.println("Report: " + emr.generateReport());
    return outputXmlTa;
}
Aggregations