Search in sources:

Example 1 with EREMentionRelationReader

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.

the class CreateTrainDevTestSplit method main.

/**
 * Reads an ERE corpus with an {@code EREEventReader}, collects the configured views of every
 * document, asks {@code CreateTrainDevTestSplit} for a train/dev/test split, and writes the
 * results to the output directory.
 *
 * <p>Files written under {@code splitDir}: {@code countInfo.txt} (one line per entry of the
 * label counts, one line per split, and a final {@code TOTALS} line) and one
 * {@code <split name>.txt} file per split containing the ids assigned to that split.
 *
 * <p>The process exits with status -1 on wrong usage, if the reader cannot be created, or if
 * any output file cannot be written.
 *
 * @param args exactly three values: the {@code EreCorpus} enum constant name, the corpus root
 *        directory, and the directory to write the split files to
 */
public static void main(String[] args) {
    if (args.length != 3) {
        System.err.println("Usage: " + NAME + " EreCorpusType corpusDir splitDir");
        System.exit(-1);
    }
    EREDocumentReader.EreCorpus ereCorpus = EREDocumentReader.EreCorpus.valueOf(args[0]);
    String corpusRoot = args[1];
    String outDir = args[2];
    ResourceManager fullRm = new CorpusSplitConfigurator().getDefaultConfig();
    boolean throwExceptionOnXmlParserFail = false;
    double trainFrac = fullRm.getDouble(CorpusSplitConfigurator.TRAIN_FRACTION.key);
    double devFrac = fullRm.getDouble(CorpusSplitConfigurator.DEV_FRACTION.key);
    double testFrac = fullRm.getDouble(CorpusSplitConfigurator.TEST_FRACTION.key);
    IOUtils.mkdir(outDir);
    String outFileStem = outDir + "/";
    String[] viewNames = fullRm.getCommaSeparatedValues(CorpusSplitConfigurator.VIEWS_TO_CONSIDER.key);
    String[] labelsToCount = {};
    EREMentionRelationReader reader = null;
    try {
        reader = new EREEventReader(ereCorpus, corpusRoot, throwExceptionOnXmlParserFail);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    // Only the requested views are kept per document id; the annotations themselves are not
    // retained, so the whole corpus does not have to stay in memory.
    Map<String, Set<View>> ereViews = new HashMap<>();
    while (reader.hasNext()) {
        TextAnnotation ta = reader.next().getTextAnnotation();
        Set<View> views = new HashSet<>();
        for (String viewName : viewNames) {
            if (ta.hasView(viewName)) {
                views.add(ta.getView(viewName));
            }
        }
        ereViews.put(ta.getId(), views);
    }
    CreateTrainDevTestSplit creator = new CreateTrainDevTestSplit(ereViews, labelsToCount);
    Map<Split, Set<String>> splits = creator.getSplits(trainFrac, devFrac, testFrac);
    Map<Split, Counter<String>> splitCounts = creator.getBestRelSplitCounts();
    Map<String, Counter<String>> counts = creator.getLabelCounts();
    // One line per label-count entry, one per split, plus the TOTALS line.
    List<String> outLines = new ArrayList<>(counts.size() + splitCounts.size() + 1);
    for (Map.Entry<String, Counter<String>> docCounts : counts.entrySet()) {
        outLines.add(docCounts.getKey() + ": " + printCounts(docCounts.getValue()));
    }
    for (Map.Entry<Split, Counter<String>> splitCount : splitCounts.entrySet()) {
        outLines.add(splitCount.getKey().name() + ": " + printCounts(splitCount.getValue()));
    }
    Counter<String> totalLabelCounts = creator.getLabelTotals();
    outLines.add("TOTALS: " + printCounts(totalLabelCounts));
    try {
        LineIO.write(outFileStem + "countInfo.txt", outLines);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
    for (Map.Entry<Split, Set<String>> split : splits.entrySet()) {
        List<String> ids = new ArrayList<>(split.getValue());
        try {
            LineIO.write(outFileStem + split.getKey().name() + ".txt", ids);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}
Also used : EREDocumentReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader) Counter(edu.illinois.cs.cogcomp.core.stats.Counter) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IOException(java.io.IOException) IOException(java.io.IOException) EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) EREEventReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader)

Example 2 with EREMentionRelationReader

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.

the class EREReaderTest method runTest.

/**
 * Creates an {@code EREMentionRelationReader} for the given corpus, reads its first document,
 * and prints every constituent of the mention view together with the corresponding span of
 * the original XML text.
 *
 * @param ereCorpus the ERE release to read
 * @param corpusRoot root directory of the corpus
 * @return the first annotation returned by the reader
 * @throws IllegalStateException if the reader cannot be instantiated; the original exception
 *         is attached as the cause
 */
private static XmlTextAnnotation runTest(EreCorpus ereCorpus, String corpusRoot) {
    boolean addNominalMentions = true;
    boolean throwExceptionOnXmlTagMismatch = true;
    ERENerReader nerReader;
    try {
        nerReader = new EREMentionRelationReader(ereCorpus, corpusRoot, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        String message = "ERROR: " + NAME + ": couldn't instantiate ERENerReader for ERE release " + ereCorpus.name() + ": " + e.getMessage();
        System.err.println(message);
        // Stop here: continuing would dereference a null reader and hide the real cause
        // behind a NullPointerException.
        throw new IllegalStateException(message, e);
    }
    XmlTextAnnotation outputXmlTa = nerReader.next();
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    View nerEre = null;
    if (addNominalMentions) {
        assert (output.hasView(ViewNames.MENTION_ERE));
        nerEre = output.getView(ViewNames.MENTION_ERE);
    } else {
        assert (output.hasView(ViewNames.NER_ERE));
        nerEre = output.getView(ViewNames.NER_ERE);
    }
    assert (nerEre.getConstituents().size() > 0);
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXmlStr = xmlSt.getOrigText();
    System.out.println("ERENerReader found " + nerEre.getConstituents().size() + " NER constituents: ");
    for (Constituent c : nerEre.getConstituents()) {
        System.out.println(TextAnnotationPrintHelper.printConstituent(c));
        // Map the constituent's character offsets back to offsets in the original XML source
        // so the cleaned surface form can be compared with the raw text.
        int start = c.getStartCharOffset();
        int end = c.getEndCharOffset();
        IntPair origOffsets = xmlSt.getOriginalOffsets(start, end);
        String origStr = origXmlStr.substring(origOffsets.getFirst(), origOffsets.getSecond());
        System.out.println("Constituent (clean) text: '" + c.getSurfaceForm() + "'");
        System.out.println("Original text: '" + origStr + "'\n---------\n");
    }
    System.out.println("Report: " + nerReader.generateReport());
    return outputXmlTa;
}
Also used : EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader) ERENerReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader) StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IOException(java.io.IOException)

Example 3 with EREMentionRelationReader

use of edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader in project cogcomp-nlp by CogComp.

the class EREReaderTest method runRelationReader.

/**
 * Reads the ENR3 corpus with an {@code EREMentionRelationReader} until the document with the
 * requested id is found, then checks its mention/relation view and its coreference view.
 *
 * <p>The first relation's string form is compared against {@code RELVALUE}, and the
 * coreference view is checked for duplicate mention spans. When {@code doSerialize} is set,
 * the annotation is also written to {@code EREsample.json} and deserialized again.
 *
 * @param corpusDir root directory of the corpus
 * @param wantedId id of the document to inspect
 * @return the annotation whose id equals {@code wantedId}
 */
private static XmlTextAnnotation runRelationReader(String corpusDir, String wantedId) {
    EREMentionRelationReader emr = null;
    try {
        boolean throwExceptionOnXmlTagMismatch = true;
        emr = new EREMentionRelationReader(EreCorpus.ENR3, corpusDir, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
    assert (emr.hasNext());
    XmlTextAnnotation outputXmlTa = null;
    // Advance until the wanted document is reached or the reader is exhausted.
    do {
        outputXmlTa = emr.next();
    } while (!outputXmlTa.getTextAnnotation().getId().equals(wantedId) && emr.hasNext());
    if (!outputXmlTa.getTextAnnotation().getId().equals(wantedId)) {
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
    }
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    assert (output.hasView(ViewNames.MENTION_ERE));
    View nerRelation = output.getView(ViewNames.MENTION_ERE);
    assert (nerRelation.getConstituents().size() > 0);
    System.out.println("EREMentionRelationReader found " + nerRelation.getRelations().size() + " relations: ");
    for (Relation r : nerRelation.getRelations()) {
        System.out.println(TextAnnotationPrintHelper.printRelation(r));
    }
    // Fail with a readable message instead of an IndexOutOfBoundsException when the view
    // holds no relations at all.
    if (nerRelation.getRelations().size() == 0) {
        fail("ERROR: no relations found in view '" + ViewNames.MENTION_ERE + "' for corpus entry '" + wantedId + "'.");
    }
    String relValue = nerRelation.getRelations().get(0).toString();
    assertEquals(RELVALUE, relValue);
    System.out.println(TextAnnotationPrintHelper.OUTPUT_SEPARATOR);
    System.out.println("ERE Coreference chains:");
    assert (output.hasView(ViewNames.COREF_ERE));
    CoreferenceView cView = (CoreferenceView) output.getView(ViewNames.COREF_ERE);
    assert (cView.getConstituents().size() > 0);
    // check no duplicate mentions are added.
    Set<IntPair> mentionSpans = new HashSet<>();
    for (Constituent c : cView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        assertFalse(mentionSpans.contains(cSpan));
        mentionSpans.add(cSpan);
    }
    System.out.println(TextAnnotationPrintHelper.printCoreferenceView(cView));
    if (doSerialize) {
        // Round-trip through JSON: write the serialized form to disk, then make sure it can
        // be deserialized back into a TextAnnotation.
        String jsonStr = SerializationHelper.serializeToJson(output);
        try {
            LineIO.write("EREsample.json", Collections.singletonList(jsonStr));
        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        TextAnnotation newTa = null;
        try {
            newTa = SerializationHelper.deserializeFromJson(jsonStr);
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertNotNull(newTa);
    }
    System.out.println("Report: " + emr.generateReport());
    return outputXmlTa;
}
Also used : IOException(java.io.IOException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) IOException(java.io.IOException) EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)

Aggregations

EREMentionRelationReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader): 3; IOException (java.io.IOException): 3; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 2; Counter (edu.illinois.cs.cogcomp.core.stats.Counter): 1; StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation): 1; ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager): 1; EREDocumentReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader): 1; EREEventReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREEventReader): 1; ERENerReader (edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.ERENerReader): 1