use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader in project cogcomp-nlp by CogComp.
the class CreateTrainDevTestSplit method main.
/**
 * Command-line entry point that builds a train/dev/test split for an ERE corpus.
 *
 * <p>Every document is read with an {@link EREEventReader}; for each one, the views named by
 * {@code CorpusSplitConfigurator.VIEWS_TO_CONSIDER} are collected and handed to
 * {@link CreateTrainDevTestSplit}. The split fractions come from the default
 * {@link CorpusSplitConfigurator} configuration. Results are written into the output directory:
 * {@code countInfo.txt} (per-document, per-split and total label counts) and one
 * {@code <split name>.txt} file per split listing the document ids assigned to it.
 *
 * <p>The process exits with status -1 on bad arguments, on reader construction failure, or on
 * any I/O error while writing the output files.
 *
 * @param args exactly three values: the name of an {@code EREDocumentReader.EreCorpus} constant,
 *        the corpus root directory, and the output directory for the split files
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }

    // Enum.valueOf throws IllegalArgumentException for an unknown name; report it as a usage
    // error rather than letting the raw exception escape.
    EREDocumentReader.EreCorpus ereCorpus = null;
    try {
        ereCorpus = EREDocumentReader.EreCorpus.valueOf(args[0]);
    } catch (IllegalArgumentException e) {
        System.err.println("Unrecognized EreCorpusType '" + args[0] + "'.");
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    String corpusRoot = args[1];
    String outDir = args[2];

    ResourceManager fullRm = new CorpusSplitConfigurator().getDefaultConfig();
    boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = fullRm.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = fullRm.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = fullRm.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);

    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";

    String[] viewNames = fullRm.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    String[] labelsToCount = {};

    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(ereCorpus, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // Only the requested views are kept per document; the annotations themselves are not
    // retained, so the whole corpus does not have to stay in memory.
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        XmlTextAnnotation xmlTextAnnotation = reader.next();
        TextAnnotation ta = xmlTextAnnotation.getTextAnnotation();
        Set<View> views = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                views.add(ta.getView(viewName));
            }
        }
        ereViews.put(ta.getId(), views);
    }

    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(ereViews, labelsToCount);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> counts = creator.getLabelCounts();

    // One line per document, one per split, plus the TOTALS line.
    List<String> outLines = new ArrayList<>(counts.size() + splitCounts.size() + 1);
    for (Map.Entry<String, Counter<String>> docEntry : counts.entrySet()) {
        outLines.add(docEntry.getKey() + ": " + printCounts(docEntry.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> splitEntry : splitCounts.entrySet()) {
        outLines.add(splitEntry.getKey().name() + ": " + printCounts(splitEntry.getValue()));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));

    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }

    // One id-list file per split, named after the split constant.
    for (Map.Entry<Split, Set<String>> splitEntry : splits.entrySet()) {
        List<String> ids = new ArrayList<>(splitEntry.getValue());
        try {
            LineIO.write(outFileStem + splitEntry.getKey().name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader in project cogcomp-nlp by CogComp.
the class MultilingualEreReaderTest method testChinese.
/**
 * Builds an {@link EREEventReader} over the Chinese data at {@code chinesePathB}, using the
 * tokenizer that {@link MultiLingualTokenizer} returns for the Chinese language code, and
 * passes it to {@code testReader}. Any exception raised while constructing the reader is
 * printed and reported through {@code fail}.
 */
public static void testChinese() {
    final boolean failOnXmlParseError = true;
    EREEventReader chineseReader = null;
    try {
        TextAnnotationBuilder tokenizer =
                MultiLingualTokenizer.getTokenizer(Language.Chinese.getCode());
        chineseReader = new EREEventReader(
                EREDocumentReader.EreCorpus.ENR3, tokenizer, chinesePathB, failOnXmlParseError);
    } catch (Exception ex) {
        ex.printStackTrace();
        fail(ex.getMessage());
    }
    testReader(chineseReader);
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader in project cogcomp-nlp by CogComp.
the class EREReaderTest method runEventReader.
/**
 * Reads the ENR3 corpus under {@code corpusDir} with an {@link EREEventReader}, advances to the
 * document whose id equals {@code wantedId}, and sanity-checks its mention and event views.
 * The event view and the reader's report are printed to standard output.
 *
 * @param corpusDir root directory of the corpus to read
 * @param wantedId id of the document to locate
 * @return the {@link XmlTextAnnotation} whose text annotation id equals {@code wantedId}
 */
private static XmlTextAnnotation runEventReader(String corpusDir, String wantedId) {
    EREEventReader emr = null;
    try {
        boolean throwExceptionOnXmlTagMismatch = true;
        emr = new EREEventReader(EreCorpus.ENR3, corpusDir, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        // Report a test failure instead of terminating the JVM running the tests.
        fail("ERROR: couldn't create EREEventReader for '" + corpusDir + "': " + e.getMessage());
    }
    assert (emr.hasNext());

    // Scan forward until the wanted document is found or the corpus is exhausted.
    XmlTextAnnotation outputXmlTa = null;
    do {
        outputXmlTa = emr.next();
    } while (!outputXmlTa.getTextAnnotation().getId().equals(wantedId) && emr.hasNext());
    if (!outputXmlTa.getTextAnnotation().getId().equals(wantedId))
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");

    TextAnnotation output = outputXmlTa.getTextAnnotation();
    assert (output.hasView(ViewNames.MENTION_ERE));
    View nerRelation = output.getView(ViewNames.MENTION_ERE);
    assert (nerRelation.getConstituents().size() > 0);

    assert (output.hasView(ViewNames.EVENT_ERE));
    PredicateArgumentView eventView = (PredicateArgumentView) output.getView(emr.getEventViewName());
    assert (eventView.getConstituents().size() > 0);
    List<Constituent> triggers = eventView.getPredicates();
    assert (triggers.size() > 0);

    List<Relation> args = eventView.getArguments(triggers.get(0));
    // Make the precondition of the attribute checks explicit rather than failing inside get(0).
    assert (!args.isEmpty());
    assert (args.get(0).getAttribute(ORIGIN) != null);
    assert (args.get(0).getAttribute(REALIS) != null);

    System.out.println(eventView.toString());
    String report = emr.generateReport();
    System.out.println("Event Reader report:\n\n" + report);
    return outputXmlTa;
}
use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader in project cogcomp-nlp by CogComp.
the class MultilingualEreReaderTest method testSpanish.
/**
 * Builds an {@link EREEventReader} over the Spanish data at {@code spanishPathA}, using the
 * tokenizer that {@link MultiLingualTokenizer} returns for the Spanish language code, and
 * passes it to {@code testReader}. Any exception raised while constructing the reader is
 * printed and reported through {@code fail}.
 */
public static void testSpanish() {
    final boolean failOnXmlParseError = true;
    EREEventReader spanishReader = null;
    try {
        TextAnnotationBuilder tokenizer =
                MultiLingualTokenizer.getTokenizer(Language.Spanish.getCode());
        spanishReader = new EREEventReader(
                EREDocumentReader.EreCorpus.ENR2, tokenizer, spanishPathA, failOnXmlParseError);
    } catch (Exception ex) {
        ex.printStackTrace();
        fail(ex.getMessage());
    }
    testReader(spanishReader);
}
Aggregations