Search in sources :

Example 81 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class MainClass method test.

/**
 * Loads a trained model, decodes every instance read from {@code testDataPath} and prints
 * timing and accuracy figures to standard output.
 *
 * <p>For each sentence, {@code evaluate} is called three times with different flag
 * combinations (undirected, directed-unlabeled, labeled); the first component of each
 * returned pair is accumulated as the "correct" count and the second component of the
 * directed-unlabeled pair is accumulated as the overall total.
 *
 * @param modelPath path handed to {@code SLModel.loadModel}
 * @param testDataPath path handed to {@code getStructuredData}
 * @param updateMatrix forwarded to the labeled {@code evaluate} call; when {@code true},
 *        {@code printMatrix()} is called after the summary has been printed
 * @throws Exception whatever model loading, data reading or inference throws
 */
private static void test(String modelPath, String testDataPath, boolean updateMatrix) throws Exception {
    SLModel model = SLModel.loadModel(modelPath);
    // Downcast once; both the dictionary load and the data reader need the concrete decoder.
    LabeledChuLiuEdmondsDecoder decoder = (LabeledChuLiuEdmondsDecoder) model.infSolver;
    decoder.loadDepRelDict();
    SLProblem sp = getStructuredData(testDataPath, decoder);
    double acc_undirected = 0.0;
    double acc_directed_unlabeled = 0.0;
    double acc_labeled = 0.0;
    double total = 0.0;
    long totalTime = 0L;
    int totalLength = 0;
    for (int i = 0; i < sp.instanceList.size(); i++) {
        DepInst sent = (DepInst) sp.instanceList.get(i);
        totalLength += sent.size();
        DepStruct gold = (DepStruct) sp.goldStructureList.get(i);
        // Only the inference call itself is timed.
        long startTime = System.currentTimeMillis();
        DepStruct prediction = (DepStruct) model.infSolver.getBestStructure(model.wv, sent);
        totalTime += (System.currentTimeMillis() - startTime);
        IntPair tmp_undirected = evaluate(sent, gold, prediction, false, false, false);
        IntPair tmp_directed_unlabeled = evaluate(sent, gold, prediction, true, false, false);
        IntPair tmp_labeled = evaluate(sent, gold, prediction, true, true, updateMatrix);
        acc_undirected += tmp_undirected.getFirst();
        acc_directed_unlabeled += tmp_directed_unlabeled.getFirst();
        acc_labeled += tmp_labeled.getFirst();
        total += tmp_directed_unlabeled.getSecond();
    }
    // The averages use integer division; guard it so an empty test set reports 0 instead of
    // dying with ArithmeticException before any result is printed.
    boolean noSentences = sp.size() == 0;
    System.out.println("Parsing time taken for " + sp.size() + " sentences with average length " + (noSentences ? 0 : totalLength / sp.size()) + ": " + totalTime);
    System.out.println("Average parsing time " + (noSentences ? 0 : totalTime / sp.size()));
    System.out.println("undirected acc " + acc_undirected);
    System.out.println("directed unlabeled acc " + acc_directed_unlabeled);
    System.out.println("labeled acc " + acc_labeled);
    System.out.println("total " + total);
    // All operands are doubles, so a zero total prints NaN rather than throwing.
    System.out.println("%age correct undirected " + (acc_undirected / total));
    System.out.println("%age correct directed & unlabeled " + (acc_directed_unlabeled / total));
    System.out.println("%age correct labeled " + (acc_labeled / total));
    if (updateMatrix)
        printMatrix();
    System.out.println("Done with testing!");
}
Also used : LabeledChuLiuEdmondsDecoder(edu.illinois.cs.cogcomp.depparse.core.LabeledChuLiuEdmondsDecoder) DepInst(edu.illinois.cs.cogcomp.depparse.core.DepInst) DepStruct(edu.illinois.cs.cogcomp.depparse.core.DepStruct) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 82 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ERENerReader method readFiller.

/**
 * Reads one filler element and, if its text can be aligned with tokens, adds a constituent
 * for it to {@code view} and records it in {@code mentionIdToConstituent}.
 *
 * <p>When {@code getTokenOffsets} returns {@code null} nothing is done. When either returned
 * offset is -1, the filler is logged and passed to {@code recordNullMentionInfo} instead of
 * being added to the view.
 *
 * <p>WARNING: filler can have null value.
 *
 * @param fillerNode XML node of the filler; its ID, OFFSET, LENGTH and TYPE attributes are read
 * @param view view that receives the new constituent
 * @param xmlTa annotation passed to {@code getTokenOffsets}; its original xml text is used for
 *        the diagnostic context window
 * @throws XMLException declared for the helper calls made here
 * @throws IllegalStateException if the filler has no surface form, or the returned second
 *         offset is less than the first
 */
private void readFiller(Node fillerNode, View view, XmlTextAnnotation xmlTa) throws XMLException {
    NamedNodeMap nnMap = fillerNode.getAttributes();
    String fillerId = nnMap.getNamedItem(ID).getNodeValue();
    int offset = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
    int length = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
    String fillerForm = SimpleXMLParser.getContentString((Element) fillerNode);
    String fillerType = nnMap.getNamedItem(TYPE).getNodeValue();
    if (null == fillerForm || "".equals(fillerForm))
        throw new IllegalStateException("ERROR: did not find surface form for filler " + fillerId);
    IntPair offsets = getTokenOffsets(offset, offset + length, fillerForm, xmlTa);
    if (null == offsets)
        return;
    if (-1 == offsets.getFirst() || -1 == offsets.getSecond()) {
        String xmlStr = xmlTa.getXmlSt().getOrigText();
        // Clamp both ends into [0, xmlStr.length()] and keep min <= max, so that a bad
        // offset cannot make this purely diagnostic substring call throw.
        int fillerWindowMax = Math.max(0, Math.min(offset + 100, xmlStr.length()));
        int fillerWindowMin = Math.min(Math.max(offset - 100, 0), fillerWindowMax);
        String fillerInfo = "filler form: " + fillerForm + "; orig xml offsets: " + offset + ", " + (offset + length) + "; context: '" + xmlStr.substring(fillerWindowMin, fillerWindowMax) + "'\n";
        logger.warn("Couldn't find filler mention in clean text: {}", fillerInfo);
        // look in markup...
        boolean isFillerFound = recordNullMentionInfo(fillerId, fillerId, "FILLER", fillerNode, true);
        if (!isFillerFound)
            logger.warn("ERROR: could not find text/xml markup corresponding to filler. " + "Since filler should not be an entity, EITHER it was in a quoted span, and therefore " + "should not have been annotated, or it's in a deleted span that should not have been deleted (check" + " EREDocumentReader's use of XmlDocumentProcessor; were the right tags provided at construction?), " + "OR it is from xml markup and the offsets are incorrect (attempted retrieval allowed for +/- 1 char)\n" + "filler info: " + fillerInfo);
        return;
    }
    // filler found...
    if (offsets.getSecond() < offsets.getFirst())
        throw new IllegalStateException("for filler " + fillerId + ", second offset is less than first " + "(first, second:" + offsets.getFirst() + "," + offsets.getSecond() + ").");
    // The constituent is built with an end of second offset + 1.
    Constituent fillerConstituent = new Constituent(fillerType, view.getViewName(), view.getTextAnnotation(), offsets.getFirst(), offsets.getSecond() + 1);
    fillerConstituent.addAttribute(EntityMentionIdAttribute, fillerId);
    fillerConstituent.addAttribute(EntityMentionTypeAttribute, FILL);
    view.addConstituent(fillerConstituent);
    mentionIdToConstituent.put(fillerId, fillerConstituent);
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 83 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class EREReaderTest method runRelationReader.

/**
 * Runs an {@code EREMentionRelationReader} over {@code corpusDir}, advances to the document
 * whose id equals {@code wantedId}, and checks its relation and coreference views.
 *
 * <p>The relations and coreference chains are printed to standard output. When the
 * {@code doSerialize} flag is set, the annotation is also serialized to JSON, written to
 * {@code EREsample.json} and deserialized again.
 *
 * @param corpusDir corpus directory passed to the reader
 * @param wantedId id of the document to check
 * @return the {@code XmlTextAnnotation} whose id matched {@code wantedId}
 */
private static XmlTextAnnotation runRelationReader(String corpusDir, String wantedId) {
    EREMentionRelationReader emr;
    try {
        boolean throwExceptionOnXmlTagMismatch = true;
        emr = new EREMentionRelationReader(EreCorpus.ENR3, corpusDir, throwExceptionOnXmlTagMismatch);
    } catch (Exception e) {
        // Report a failure with its cause instead of killing the JVM with System.exit.
        throw new AssertionError("ERROR: could not create EREMentionRelationReader for '" + corpusDir + "'.", e);
    }
    // Explicit checks are used instead of the assert keyword, which does nothing unless the
    // JVM is started with -ea.
    if (!emr.hasNext()) {
        fail("ERROR: reader has no documents for corpus directory '" + corpusDir + "'.");
    }
    XmlTextAnnotation outputXmlTa = null;
    do {
        outputXmlTa = emr.next();
    } while (!outputXmlTa.getTextAnnotation().getId().equals(wantedId) && emr.hasNext());
    if (!outputXmlTa.getTextAnnotation().getId().equals(wantedId))
        fail("ERROR: didn't find corpus entry with id '" + wantedId + "'.");
    TextAnnotation output = outputXmlTa.getTextAnnotation();
    if (!output.hasView(ViewNames.MENTION_ERE)) {
        fail("ERROR: document '" + wantedId + "' has no view " + ViewNames.MENTION_ERE + ".");
    }
    View nerRelation = output.getView(ViewNames.MENTION_ERE);
    if (nerRelation.getConstituents().isEmpty()) {
        fail("ERROR: view " + ViewNames.MENTION_ERE + " has no constituents.");
    }
    System.out.println("EREMentionRelationReader found " + nerRelation.getRelations().size() + " relations: ");
    for (Relation r : nerRelation.getRelations()) {
        System.out.println(TextAnnotationPrintHelper.printRelation(r));
    }
    String relValue = nerRelation.getRelations().get(0).toString();
    assertEquals(RELVALUE, relValue);
    System.out.println(TextAnnotationPrintHelper.OUTPUT_SEPARATOR);
    System.out.println("ERE Coreference chains:");
    if (!output.hasView(ViewNames.COREF_ERE)) {
        fail("ERROR: document '" + wantedId + "' has no view " + ViewNames.COREF_ERE + ".");
    }
    CoreferenceView cView = (CoreferenceView) output.getView(ViewNames.COREF_ERE);
    if (cView.getConstituents().isEmpty()) {
        fail("ERROR: view " + ViewNames.COREF_ERE + " has no constituents.");
    }
    // check no duplicate mentions are added.
    Set<IntPair> mentionSpans = new HashSet<>();
    for (Constituent c : cView.getConstituents()) {
        IntPair cSpan = c.getSpan();
        assertFalse(mentionSpans.contains(cSpan));
        mentionSpans.add(cSpan);
    }
    System.out.println(TextAnnotationPrintHelper.printCoreferenceView(cView));
    if (doSerialize) {
        String jsonStr = SerializationHelper.serializeToJson(output);
        try {
            LineIO.write("EREsample.json", Collections.singletonList(jsonStr));
        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        TextAnnotation newTa = null;
        try {
            newTa = SerializationHelper.deserializeFromJson(jsonStr);
        } catch (Exception e) {
            e.printStackTrace();
            fail(e.getMessage());
        }
        assertNotNull(newTa);
    }
    System.out.println("Report: " + emr.generateReport());
    return outputXmlTa;
}
Also used : IOException(java.io.IOException) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) EREMentionRelationReader(edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREMentionRelationReader)

Example 84 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class EREReaderTest method main.

//
//            "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/" +
//                    "data/source/ENG_DF_001241_20150407_F0000007T.xml";
// public void testNerReader() {
/**
 * Runs the ERE reader checks against the three English ERE releases, then verifies xml
 * markup offsets on the last release and runs the relation and event readers.
 *
 * <p>There are THREE ERE English releases.
 * Regrettably, they do not follow consistent standards for organization or for annotation.
 *
 * <p>LDC2015E29_DEFT_Rich_ERE English V2 has two sets of annotation files: one, used for the Event Argument Extraction
 *    task in TAC that year, includes a small amount of additional markup to make each xml document well-formed.
 *    This changes the annotation offsets. Taggable entities within quoted blocks are annotated.
 *
 * <p>LDC2015E68_DEFT_Rich_ERE_English R2_V2 has as source files excerpts from multi-post discussion forum documents.
 * Taggable entities within quoted blocks are annotated.
 *
 * <p>LDC2016E31_DEFT_Rich_ERE_English ENR3 has -- I believe -- complete threads, where annotation files may be
 *    broken into several chunks. Taggable entities within quoted blocks are NOT marked.
 *
 * <p>There are two Spanish and two Chinese ERE releases (aside from a parallel English-Chinese release).
 * Spanish/Chinese release 1 have the same characteristics as English release 2.
 * Spanish/Chinese release 2 have the same characteristics as English release 3.
 *
 * @param args not used
 */
public static void main(String[] args) {
    /*
     * ERE documents in release 2015E29: mainly newswire, some discussion format.
     * This test uses the Event Argument Extraction version of the data, as this includes xml markup that makes
     * the source files well-formed, and we are likely to need this reader for TAC EAE tasks. Moreover, the later
     * ERE release uses this format.
     */
    String corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E29_DEFT_Rich_ERE_English_Training_Annotation_V2/data/";
    XmlTextAnnotation outputXmlTa = runTest(EreCorpus.ENR1, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2015E68_DEFT_Rich_ERE_English_Training_Annotation_R2_V2/data/";
    outputXmlTa = runTest(EreCorpus.ENR2, corpusDir);
    corpusDir = "/shared/corpora/corporaWeb/deft/eng/LDC2016E31_DEFT_Rich_ERE_English_Training_Annotation_R3/data/";
    // Only the result of this last run (ENR3) is inspected below.
    outputXmlTa = runTest(EreCorpus.ENR3, corpusDir);
    System.out.println("Testing EREMentionRelationReader...");
    StringTransformation xmlSt = outputXmlTa.getXmlSt();
    String origXml = xmlSt.getOrigText();
    List<XmlDocumentProcessor.SpanInfo> markup = outputXmlTa.getXmlMarkup();
    Map<IntPair, XmlDocumentProcessor.SpanInfo> markupInfo = XmlDocumentProcessor.compileOffsetSpanMapping(markup);
    Map<IntPair, Set<String>> markupAttributes = XmlDocumentProcessor.compileAttributeValues(markup);
    // Explicit checks replace the assert keyword (a no-op unless the JVM runs with -ea) and
    // also cover the case where no attribute values were recorded for the span at all.
    Set<String> dateTimeReported = markupAttributes.get(DATETIMEOFFSETS);
    if (dateTimeReported == null || !dateTimeReported.contains(DATETIMEVAL)) {
        fail("ERROR: expected attribute value '" + DATETIMEVAL + "' at offsets " + DATETIMEOFFSETS + ".");
    }
    assertEquals(DATETIMEVAL, origXml.substring(DATETIMEOFFSETS.getFirst(), DATETIMEOFFSETS.getSecond()));
    //        private static final String ORIGAUTHVAL = "tinydancer";
    //        private static final IntPair ORIGAUTHOFFSETS = new IntPair(2943, 2953);
    Set<String> origAuth = markupAttributes.get(ORIGAUTHOFFSETS);
    if (origAuth == null || !origAuth.contains(ORIGAUTHVAL)) {
        fail("ERROR: expected attribute value '" + ORIGAUTHVAL + "' at offsets " + ORIGAUTHOFFSETS + ".");
    }
    assertEquals(ORIGAUTHVAL, origXml.substring(ORIGAUTHOFFSETS.getFirst(), ORIGAUTHOFFSETS.getSecond()));
    Set<String> auth = markupAttributes.get(AUTHOROFFSETS);
    if (auth == null || !auth.contains(AUTHORVAL)) {
        fail("ERROR: expected attribute value '" + AUTHORVAL + "' at offsets " + AUTHOROFFSETS + ".");
    }
    assertEquals(AUTHORVAL, origXml.substring(AUTHOROFFSETS.getFirst(), AUTHOROFFSETS.getSecond()));
    /*
     * other values recorded at same offsets are not required to be mapped to xml document char offsets.
     * Since this value is not retained in the cleaned text, there is NO CORRESPONDING CONSTITUENT.
     */
    // Look the span up once and read all four attributes from it.
    XmlDocumentProcessor.SpanInfo postSpan = markupInfo.get(POSTOFFSETS);
    String mid = postSpan.attributes.get(ENTITY_MENTION_ID).getFirst();
    assertEquals(MENTION_ID_VAL, mid);
    String nt = postSpan.attributes.get(NOUN_TYPE).getFirst();
    assertEquals(NOUN_TYPE_VAL, nt);
    String eid = postSpan.attributes.get(ENTITY_ID).getFirst();
    assertEquals(ENTITY_ID_VAL, eid);
    String spec = postSpan.attributes.get(SPECIFICITY).getFirst();
    assertEquals(SPECIFICITY_VAL, spec);
    assertEquals(QUOTE, markupInfo.get(QUOTEOFFSETS).label);
    String quoteStr = origXml.substring(QUOTEOFFSETS.getFirst(), QUOTEOFFSETS.getSecond());
    assertEquals(QUOTE_VAL, quoteStr);
    // The relation and event readers are both run on the same document of the ENR3 corpus.
    String wantedId = "ENG_DF_000170_20150322_F00000082.xml";
    runRelationReader(corpusDir, wantedId);
    runEventReader(corpusDir, wantedId);
    corpusDir = "/shared/corpora/corporaWeb/deft/event/LDC2016E73_TAC_KBP_2016_Eval_Core_Set_Rich_ERE_Annotation_with_Augmented_Event_Argument_v2/data/eng/nw";
    String newWantedId = "ENG_NW_001278_20131206_F00011WGK.xml";
    // The returned annotation is not inspected; the call is made for the checks it runs.
    runEventReader(corpusDir, newWantedId);
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 85 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ERENerReader method getMention.

/**
 * Builds a constituent for one entity mention element, with its mention type, mention id and
 * the character offsets of its head recorded as attributes, and registers it in
 * {@code mentionIdToConstituent}.
 *
 * @param mentionNode XML node of the mention; its NOUN_TYPE, ID, OFFSET and LENGTH attributes
 *        and its MENTION_TEXT / MENTION_HEAD child elements are read
 * @param label label given to the new constituent
 * @param view supplies the view name and text annotation for the constituent; the
 *        constituent is not added to the view here
 * @param xmlTa annotation passed to {@code getTokenOffsets}
 * @return the new constituent, or {@code null} when the mention is a skipped PRO/NOM mention,
 *         has no surface form, cannot be mapped to token offsets, or the constituent
 *         constructor throws {@code IllegalArgumentException}
 * @throws XMLException declared for the helper calls made here
 */
private Constituent getMention(Node mentionNode, String label, View view, XmlTextAnnotation xmlTa) throws XMLException {
    NamedNodeMap nnMap = mentionNode.getAttributes();
    String noun_type = nnMap.getNamedItem(NOUN_TYPE).getNodeValue();
    String mId = nnMap.getNamedItem(ID).getNodeValue();
    if ((noun_type.equals(PRO) || noun_type.equals(NOM)) && !addNominalMentions)
        return null;
    /*
     * update this count here to avoid creating discrepancy in file count vs created count if
     * user does not add nominal mentions
     */
    numMentionsInSource++;
    // we have a valid mention(a "NAM" or a "NOM"), add it to our view.
    /*
     * expect one child
     */
    NodeList mnl = ((Element) mentionNode).getElementsByTagName(MENTION_TEXT);
    if (mnl.getLength() == 0) {
        logger.error("No surface form found for mention with id {}.", mId);
        return null;
    }
    String mentionForm = SimpleXMLParser.getContentString((Element) mnl.item(0));
    int offset = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
    int length = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
    IntPair offsets = getTokenOffsets(offset, offset + length, mentionForm, xmlTa);
    if (null == offsets)
        return null;
    else if (-1 == offsets.getFirst() && -1 == offsets.getSecond()) {
        // handled by next layer up, which records the info separately
        return null;
    }
    IntPair headTokenOffsets = null;
    mnl = ((Element) mentionNode).getElementsByTagName(MENTION_HEAD);
    if (mnl.getLength() > 0) {
        // Read the head's own offset/length and text. Previously the mention's attributes
        // were re-read here, and Node.getNodeValue() is always null for an element node.
        Element headNode = (Element) mnl.item(0);
        NamedNodeMap headAttributes = headNode.getAttributes();
        Node headOffsetAttr = headAttributes.getNamedItem(OFFSET);
        Node headLengthAttr = headAttributes.getNamedItem(LENGTH);
        if (null != headOffsetAttr && null != headLengthAttr) {
            String headForm = SimpleXMLParser.getContentString(headNode);
            int headStart = Integer.parseInt(headOffsetAttr.getNodeValue());
            int headLength = Integer.parseInt(headLengthAttr.getNodeValue());
            headTokenOffsets = getTokenOffsets(headStart, headStart + headLength, headForm, xmlTa);
        }
    }
    // Fall back to the full mention span when the head is absent or could not be located.
    if (null == headTokenOffsets || -1 == headTokenOffsets.getFirst() || -1 == headTokenOffsets.getSecond())
        headTokenOffsets = offsets;
    IntPair headCharOffsets = getCharacterOffsets(headTokenOffsets.getFirst(), headTokenOffsets.getSecond());
    Constituent mentionConstituent = null;
    try {
        // The constituent is built with an end of second offset + 1.
        mentionConstituent = new Constituent(label, view.getViewName(), view.getTextAnnotation(), offsets.getFirst(), offsets.getSecond() + 1);
        mentionConstituent.addAttribute(EntityMentionTypeAttribute, noun_type);
        mentionConstituent.addAttribute(EntityMentionIdAttribute, mId);
        mentionConstituent.addAttribute(EntityHeadStartCharOffset, Integer.toString(headCharOffsets.getFirst()));
        mentionConstituent.addAttribute(EntityHeadEndCharOffset, Integer.toString(headCharOffsets.getSecond()));
        mentionIdToConstituent.put(mId, mentionConstituent);
    } catch (IllegalArgumentException iae) {
        // Counted as an overlap; the caller receives whatever was assigned before the failure
        // (null if the constructor itself threw).
        numOverlaps++;
    }
    return mentionConstituent;
}
Also used : Node(org.w3c.dom.Node) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)103 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)32 Test (org.junit.Test)20 ArrayList (java.util.ArrayList)19 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)18 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)14 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)13 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)6 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)5 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 Matcher (java.util.regex.Matcher)4 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)3 SentenceSplitter (edu.illinois.cs.cogcomp.lbjava.nlp.SentenceSplitter)3 LinkedVector (edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector)3 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)3 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Annotation (edu.stanford.nlp.pipeline.Annotation)3