Search in sources :

Example 11 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class EREEventReader method createTrigger.

/**
 * Builds the trigger Constituent for an event mention from its one trigger child element.
 *
 * @param eventMentionNode event mention markup expected to contain exactly one trigger element
 * @param xmlTa            XmlTextAnnotation whose TextAnnotation the new Constituent is attached to
 * @param eventId          event id, stored on the trigger and used in error messages
 * @return the trigger Constituent, or null when getTokenOffsets returns null or the pair (-1, -1)
 * @throws IllegalStateException if the event mention has zero or more than one trigger element
 */
private Constituent createTrigger(Node eventMentionNode, XmlTextAnnotation xmlTa, String eventId) throws XMLException {
    /*  <trigger source="ENG_DF_001241_20150407_F0000007T" offset="179" length="5">trade</trigger> */
    NodeList triggerNodes = ((Element) eventMentionNode).getElementsByTagName(TRIGGER);
    if (triggerNodes.getLength() == 0)
        throw new IllegalStateException("Event " + eventId + " has no trigger element.");
    if (triggerNodes.getLength() > 1)
        throw new IllegalStateException("Event " + eventId + " has multiple trigger elements.");

    // the checks above leave exactly one trigger element to process
    Node eventTriggerNode = triggerNodes.item(0);
    String triggerForm = SimpleXMLParser.getContentString((Element) eventTriggerNode);
    NamedNodeMap triggerAttributes = eventTriggerNode.getAttributes();
    String source = triggerAttributes.getNamedItem(SOURCE).getNodeValue();
    int startChar = Integer.parseInt(triggerAttributes.getNamedItem(OFFSET).getNodeValue());
    int charLength = Integer.parseInt(triggerAttributes.getNamedItem(LENGTH).getNodeValue());

    IntPair tokenSpan = getTokenOffsets(startChar, startChar + charLength, triggerForm, xmlTa);
    // a (-1, -1) span is handled by next layer up, which records the info separately
    if (tokenSpan == null || (tokenSpan.getFirst() == -1 && tokenSpan.getSecond() == -1))
        return null;

    // the Constituent is built with an end index one past the second offset
    Constituent trigger = new Constituent(TRIGGER, getEventViewName(), xmlTa.getTextAnnotation(), tokenSpan.getFirst(), tokenSpan.getSecond() + 1);
    trigger.addAttribute(EventIdAttribute, eventId);
    trigger.addAttribute(SOURCE, source);
    return trigger;
}
Also used : IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 12 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ERENerReader method findAndUpdateMentionInfo.

/**
 * Finds the span info stored in offsetToSpanInfo for a mention and attaches the mention's attributes to it.
 * <p>
 * Mention length should be the same, but in at least one file many offsets are shifted by +1, and a -1
 * shift is allowed as well. The lookup order is: exact offsets, both offsets - 1, both offsets + 1.
 *
 * @param origOffsets offsets of the mention, also stored alongside every attribute value
 * @param nounType    value recorded under NOUN_TYPE
 * @param label       value recorded under TYPE; skipped when null
 * @param eId         value recorded under ENTITY_ID
 * @param mId         value recorded under ENTITY_MENTION_ID
 * @param specificity value recorded under SPECIFICITY
 * @return the updated span info, or null if none of the three lookups matched
 */
private XmlDocumentProcessor.SpanInfo findAndUpdateMentionInfo(IntPair origOffsets, String nounType, String label, String eId, String mId, String specificity) {
    XmlDocumentProcessor.SpanInfo mentionInfo = offsetToSpanInfo.get(origOffsets);
    // fall back to the shifted keys, -1 first and then +1, stopping at the first hit
    for (int shift = -1; null == mentionInfo && shift <= 1; shift += 2) {
        mentionInfo = offsetToSpanInfo.get(new IntPair(origOffsets.getFirst() + shift, origOffsets.getSecond() + shift));
    }
    if (null == mentionInfo) {
        return null;
    }

    mentionInfo.attributes.put(ENTITY_ID, new Pair(eId, origOffsets));
    mentionInfo.attributes.put(ENTITY_MENTION_ID, new Pair(mId, origOffsets));
    mentionInfo.attributes.put(SPECIFICITY, new Pair(specificity, origOffsets));
    mentionInfo.attributes.put(NOUN_TYPE, new Pair(nounType, origOffsets));
    // a span info was found by this point, so the recorded flag is always "true"
    mentionInfo.attributes.put(IS_FOUND, new Pair(Boolean.toString(true), origOffsets));
    if (null != label) {
        mentionInfo.attributes.put(TYPE, new Pair(label, origOffsets));
    }
    return mentionInfo;
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Example 13 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ERENerReader method getTokenOffsets.

/**
 * find the token offsets in the TextAnnotation that correspond to the source character offsets for the given
 * mention
 *
 * @param origStartOffset start character offset from xml markup
 * @param origEndOffset   end character offset from xml markup
 * @param mentionForm     mention form from xml markup (used only in log messages)
 * @param xmlTa           XmlTextAnnotation object storing original xml, transformed text, extracted xml markup,
 *                        and corresponding TextAnnotation
 * @return IntPair(-1, -1) if the specified offsets correspond to deleted span (and hence likely a name mention
 * in xml metadata, e.g. post author); null if no mapped tokens could be found (possibly, indexes refer
 * to the middle of a single token because tokenizer can't segment some strings); or the corresponding
 * token indexes
 */
protected IntPair getTokenOffsets(int origStartOffset, int origEndOffset, String mentionForm, XmlTextAnnotation xmlTa) {
    StringTransformation st = xmlTa.getXmlSt();
    String origStr = st.getOrigText().substring(origStartOffset, origEndOffset);
    // a span that begins with a space or newline is treated as shifted one character to the right
    if (origStr.startsWith(" ") || origStr.startsWith("\n")) {
        origStartOffset += 1;
        origEndOffset += 1;
    }
    int adjStart = st.computeModifiedOffsetFromOriginal(origStartOffset);
    int adjEnd = st.computeModifiedOffsetFromOriginal(origEndOffset);
    if (adjStart == adjEnd) {
        // probably, maps to span deleted when creating cleaned-up text
        return new IntPair(-1, -1);
    }
    IntPair returnOffset = null;
    int si = 0, ei = 0;
    // Tracks explicitly whether the start lookup completed. Testing si == 0 is ambiguous, because 0 is
    // also the valid index of the first token.
    boolean startFound = false;
    TextAnnotation ta = xmlTa.getTextAnnotation();
    String rawText = ta.getText();
    String rawStr = rawText.substring(adjStart, adjEnd);
    logger.debug("source xml str: '" + origStr + "' (" + origStartOffset + "," + origEndOffset + ")");
    try {
        si = findStartIndex(adjStart);
        startFound = true;
        ei = findEndIndex(adjEnd, rawText);
        returnOffset = new IntPair(si, ei);
    } catch (IllegalArgumentException iae) {
        logger.error("could not find token offsets for mention form '" + mentionForm + "', start, end orig: (" + origStartOffset + "," + origEndOffset + "); adjusted: (" + adjStart + "," + adjEnd + ").");
        // NOTE(review): terminating the JVM from a reader is drastic; kept because existing callers may
        // rely on this fail-fast behavior.
        System.exit(1);
    } catch (RuntimeException re) {
        numOffsetErrors++;
        logger.error("Error finding text for '{}' at offsets {}:", rawStr, (adjStart + "-" + adjEnd));
        // best-effort indexes, used only to print the surrounding tokens below
        si = findStartIndexIgnoreError(adjStart);
        ei = findEndIndexIgnoreError(adjEnd);
        if (!startFound)
            logger.error("Could not find start token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        else
            logger.error("Could not find end token : text='" + mentionForm + "' at adjusted offsets " + adjStart + " to " + adjEnd);
        // log a window of up to two tokens either side, marking the start and end tokens with ':'
        int max = ta.getTokens().length;
        int start = si >= 2 ? si - 2 : 0;
        int end = (ei + 2) < max ? ei + 2 : max;
        StringBuilder bldr = new StringBuilder();
        for (int jj = start; jj < end; jj++) {
            bldr.append(" ");
            if (jj == si)
                bldr.append(":");
            bldr.append(ta.getToken(jj));
            if (jj == ei)
                bldr.append(":");
            bldr.append(" ");
        }
        bldr.append("\n");
        logger.error(bldr.toString());
    }
    return returnOffset;
}
Also used : StringTransformation(edu.illinois.cs.cogcomp.core.utilities.StringTransformation) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 14 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class ERENerReader method recordNullMentionInfo.

/**
 * for a mention that could not be mapped to a set of tokens in the cleaned text, record the information
 * to allow use of information by downstream systems in the XmlTextAnnotation object associated with the
 * source xml.  This means finding the original span that contains the mention (expected to be an attribute)
 * and updating its attributes with additional mention information.
 *
 * @param label       label for entity
 * @param eId         entity id
 * @param specificity entity specificity
 * @param mentionNode mention markup
 * @param isFiller    when true, the noun type is "NONE" and the surface form is the node's text content;
 *                    otherwise both are read from the mention's NOUN_TYPE attribute and MENTION_TEXT child
 * @return true if a matching span was found and updated (exact or shifted offsets), false otherwise
 * @throws XMLException declared for the surface-form extraction via SimpleXMLParser
 */
private boolean recordNullMentionInfo(String label, String eId, String specificity, Node mentionNode, boolean isFiller) throws XMLException {
    NamedNodeMap nnMap = mentionNode.getAttributes();
    String mId = nnMap.getNamedItem(ID).getNodeValue();
    String nounType = "NONE";
    if (!isFiller)
        nounType = nnMap.getNamedItem(NOUN_TYPE).getNodeValue();
    /*
     * expect one child
     */
    String mentionForm = null;
    if (isFiller)
        mentionForm = mentionNode.getTextContent();
    else {
        NodeList mnl = ((Element) mentionNode).getElementsByTagName(MENTION_TEXT);
        if (mnl.getLength() > 0) {
            mentionForm = SimpleXMLParser.getContentString((Element) mnl.item(0));
        } else {
            logger.error("No surface form found for mention with id {}.", mId);
        }
    }
    int offset = Integer.parseInt(nnMap.getNamedItem(OFFSET).getNodeValue());
    int length = Integer.parseInt(nnMap.getNamedItem(LENGTH).getNodeValue());
    IntPair origOffsets = new IntPair(offset, offset + length);
    // findAndUpdateMentionInfo already tries the exact offsets and the -1/+1 shifted offsets
    XmlDocumentProcessor.SpanInfo mentionInfo = findAndUpdateMentionInfo(origOffsets, nounType, label, eId, mId, specificity);
    boolean isFound = (null != mentionInfo);
    if (isFound) {
        // mention was found in the xml markup info...
        // ...and so excluded from cleaned-up text.
        numXmlMarkupMentionsGenerated++;
    } else {
        // Only this warning is emitted; no further lookup is attempted after this point.
        logger.warn("even with shifted indexes, could not find offset pair (" + origOffsets.getFirst() + "," + (origOffsets.getSecond()) + ") in xml markup info " + "in XmlTextAnnotation. Entity id, mention id, label, form are: " + eId + "," + mId + ", " + label + "," + mentionForm + ".");
    }
    return isFound;
}
Also used : XmlDocumentProcessor(edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair)

Example 15 with IntPair

use of edu.illinois.cs.cogcomp.core.datastructures.IntPair in project cogcomp-nlp by CogComp.

the class NombankFields method createPredicate.

/**
 * Creates the one-token "Predicate" constituent for this entry.
 * <p>
 * The start token is the first element of the IntPair carried by the label of the yield entry at index
 * predicateTerminal. The constituent is tagged with this entry's lemma and sense.
 *
 * @param ta       TextAnnotation the constituent is attached to
 * @param viewName name of the view the constituent belongs to
 * @param yield    tree terminals, each labelled with a (String, IntPair) pair
 * @return the new predicate constituent spanning [start, start + 1)
 */
@Override
public Constituent createPredicate(TextAnnotation ta, String viewName, List<Tree<Pair<String, IntPair>>> yield) {
    Tree<Pair<String, IntPair>> terminal = yield.get(predicateTerminal);
    IntPair terminalSpan = terminal.getLabel().getSecond();
    int predicateStart = terminalSpan.getFirst();
    int predicateEnd = predicateStart + 1;

    Constituent predicate = new Constituent("Predicate", viewName, ta, predicateStart, predicateEnd);
    predicate.addAttribute(PropbankReader.LemmaIdentifier, lemma);
    predicate.addAttribute(PropbankReader.SenseIdentifier, sense);
    return predicate;
}
Also used : Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent) IntPair(edu.illinois.cs.cogcomp.core.datastructures.IntPair) Pair(edu.illinois.cs.cogcomp.core.datastructures.Pair)

Aggregations

IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)129 Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair)39 ArrayList (java.util.ArrayList)27 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)26 Test (org.junit.Test)21 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)18 StringTransformation (edu.illinois.cs.cogcomp.core.utilities.StringTransformation)14 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)8 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)7 Tree (edu.illinois.cs.cogcomp.core.datastructures.trees.Tree)6 TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)5 Matcher (java.util.regex.Matcher)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)4 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)4 Sentence (edu.illinois.cs.cogcomp.lbjava.nlp.Sentence)4 FileNotFoundException (java.io.FileNotFoundException)4 IOException (java.io.IOException)4 JsonObject (com.google.gson.JsonObject)3 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)3 PredicateArgumentView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.PredicateArgumentView)3